diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e0116c1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+
+data/gene_name_info/query_full_name.txt
+data/gene_name_info/query_ids.txt
+data/gene_name_info/query_snps.txt
+data/gene_name_info/query_symbol.txt
+results/baseline_doc/pubmed.zinc.0.15.txt
+results/baseline_doc/pubmed.zinc.1.15.txt
diff --git a/REPRODUCING.md b/REPRODUCING.md
new file mode 100644
index 0000000..3146537
--- /dev/null
+++ b/REPRODUCING.md
@@ -0,0 +1,36 @@
+This [Code Ocean](https://codeocean.com) Compute Capsule will allow you to reproduce the results published by the author on your local machine<sup>1</sup>. Follow the instructions below, or consult [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information. Don't hesitate to reach out to [Support](mailto:support@codeocean.com) if you have any questions.
+
+<sup>1</sup> You may need access to additional hardware and/or software licenses.
+
+# Prerequisites
+
+- [Docker Community Edition (CE)](https://www.docker.com/community-edition)
+- [nvidia-container-runtime](https://docs.docker.com/config/containers/resource_constraints/#gpu) for code that leverages the GPU
+- MATLAB/MOSEK/Stata licenses where applicable
+
+# Instructions
+
+## The computational environment (Docker image)
+
+This capsule is private and its environment cannot be downloaded at this time. You will need to rebuild the environment locally.
+
+> If there's any software requiring a license that needs to be run during the build stage, you'll need to make your license available. See [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information.
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command:
+```shell
+cd environment && docker build . --tag 6ef700ed-ff07-4a42-bf13-65d4165511b6; cd ..
+```
+
+> This step will recreate the environment (i.e., the Docker image) locally, fetching and installing any required dependencies in the process. If any external resources have become unavailable for any reason, the environment will fail to build.
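+
+As an optional sanity check (assuming a standard Docker installation; this is not part of the official instructions), you can confirm that the image was built before moving on:
+```shell
+# list local images and look for the tag used in the build step above
+docker image ls | grep 6ef700ed
+```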
+
+## Running the capsule to reproduce the results
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command, adjusting parameters as needed:
+```shell
+docker run --platform linux/amd64 --rm --gpus all \
+  --workdir /code \
+  --volume "$PWD/data":/data \
+  --volume "$PWD/code":/code \
+  --volume "$PWD/results":/results \
+  6ef700ed-ff07-4a42-bf13-65d4165511b6 bash run
+```
diff --git a/code/Extrinsic_application_CVD_prediction.py b/code/Extrinsic_application_CVD_prediction.py
new file mode 100644
index 0000000..9617b75
--- /dev/null
+++ b/code/Extrinsic_application_CVD_prediction.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 30 21:59:06 2022
+
+@author: Jihye Moon
+"""
+import sys
+import os
+import pathlib
+
+import pandas as pd
+import numpy as np
+
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import StratifiedShuffleSplit as strata
+
+import lib.ML_models as ml
+sys.path.append('lib')
+import loading_literature_embedding as emb
+
+def data_split(X_train_index, X_test_index, X, y):
+    # split the held-out indices in half: first half for test, second half for validation
+    valid_data = int(len(X_test_index)/2)
+    test_data = int(len(X_test_index))-valid_data
+
+    test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data]
+
+    X_train = X[X_train_index]; X_test = X[test]; X_valid = X[valid]
+
+    y_train = y[X_train_index]
+    y_test = y[test]
+    y_valid = y[valid]
+
+    X_train = np.reshape(X_train, (X_train.shape[0], -1)); X_test = np.reshape(X_test, (X_test.shape[0], -1))
+    X_valid = np.reshape(X_valid, (X_valid.shape[0], -1))
+    y_train = np.squeeze(y_train); y_test = np.squeeze(y_test); y_valid = np.squeeze(y_valid)
+
+    # standardize features using statistics from the training split only
+    scaler = StandardScaler()
+    scaler.fit(X_train)
+    X_train = scaler.transform(X_train); X_test = scaler.transform(X_test); X_valid = scaler.transform(X_valid)
+    return X_train, X_test, X_valid, y_train, y_test, y_valid
+
+def loading_variable_embedding(data_path):
+    var_symbol = list(pd.read_csv(data_path+'/variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+    var_name = list(pd.read_csv(data_path+'/variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+    tar_symbol = list(pd.read_csv(data_path+'/target_variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+    tar_name = list(pd.read_csv(data_path+'/target_variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+
+    variables_indexing={}; disease_variables_indexing={}
+
+    for i in range(len(var_name)):
+        variables_indexing[var_symbol[i]] = var_name[i]
+
+    for i in range(len(tar_name)):
+        disease_variables_indexing[tar_symbol[i]] = tar_name[i]
+
+    additional_dictionary = {'uricosurics':'uricosuric'}
+    # If a variable name is so unusual that it cannot be found in the embedding vocabulary,
+    # add it to this dictionary to avoid errors during the feature selection tasks.
+
+    embedding_list, index2variables, embedding, removal, removed_words = emb2simi.variable2embed(words_list, syn0norm, variables_indexing, additional_dictionary)
+
+    if removal==[]:
+        print(" === No problems with your variable names")
+        target_embedding_list, index2target, target_embedding, _, _ = emb2simi.variable2embed(words_list, syn0norm, disease_variables_indexing, additional_dictionary)
+
+        return embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, \
+            target_embedding_list, index2target, index2variables, target_embedding, embedding
+    else:
+        print(" === Check your variable names for errors")
+        return 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+def 
CVD_Prediction_with_FS_DR(data_path, Xt, y):
+    feature_size = 128; i=0
+    split_info = strata(n_splits=5, test_size=0.2, random_state=12)
+    total_FS_Pre=[]; total_FS_prob=[]
+    total_DR_pre=[]; total_DR_prob=[]
+    embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, target_embedding_list, index2target, index2variables, target_embedding, embedding = loading_variable_embedding(data_path)
+    for X_train_index, X_test_index in split_info.split(Xt.values, y):
+        result_dir = os.path.join(output_path +str(i))
+        pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True)
+        X_train, X_test, X_valid, y_train, y_test, y_valid = data_split(X_train_index, X_test_index, Xt.values, y)
+        pr.save_label(y_test, 'CVD_label', result_dir) # save y_test labels to evaluate CVD prediction performance for each fold
+        print("=== Running our feature selector --- our FS selects features by feature name and uses the same feature set for 5-fold cross-validation.")
+        embed_name = fs.Our_FS(emb2simi, str(i)+'rf_embedding_features', embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, embedding, target_embedding_list, index2target, index2variables, target_embedding, feature_size, result_dir)
+
+        print("=== Running our dimensionality reducer")
+        A1, A2, A3 = dr.Our_DR(embedding, X_train, X_test, X_valid, feature_size)
+
+        print("=== Running MLs with Feature Selection (Our FS)")
+        X2 = Xt[embed_name].values ### keep only the 128 variables selected by our FS
+        valid_data = int(len(X_test_index)/2); test_data = int(len(X_test_index))-valid_data
+        test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data] # split the held-out data into test and validation
+        X_train2 = X2[X_train_index]; X_test2 = X2[test]; X_valid2 = X2[valid]
+
+        X_train2 = np.reshape(X_train2, (X_train2.shape[0], -1))
+        X_test2 = np.reshape(X_test2, (X_test2.shape[0], -1))
+        X_valid2 = np.reshape(X_valid2, (X_valid2.shape[0], -1))
+
+        scaler = StandardScaler()
+        scaler.fit(X_train2)
+        X_train2 = scaler.transform(X_train2); X_test2 = scaler.transform(X_test2); X_valid2 = scaler.transform(X_valid2)
+
+        Our_FS_total_prediction, Our_FS_total_prob = pr.run_save(X_train2, y_train, X_test2, y_test, X_valid2, y_valid, 'FS.embedding', 'SMOTE', feature_size, result_dir)
+        total_FS_Pre.append(Our_FS_total_prediction); total_FS_prob.append(Our_FS_total_prob)
+        print("=== Running MLs with Dimensionality Reduction (Our DR)")
+        Our_DR_total_prediction, Our_DR_total_prob = pr.run_save(A1, y_train, A2, y_test, A3, y_valid, 'DR.embedding', 'SMOTE', feature_size, result_dir)
+        total_DR_pre.append(Our_DR_total_prediction); total_DR_prob.append(Our_DR_total_prob) # accumulate the DR predictions and probabilities
+        i+=1
+    print('all results are saved in ', output_path)
+    return total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob
+
+data_path = '../data/Example'
+model_path = '../data/old_model'
+output_path = '../results/prediction/'
+
+fs = ml.feature_selectors()
+dr = ml.dimension_reducers()
+pr = ml.predictors()
+
+gene_name = '../data/gene_name_info/query_full_name'; gene_symb='../data/gene_name_info/query_symbol'
+emb2simi=emb.embedding_vector()
+
+words_list, index2word, syn0norm, _ = emb2simi.setting(model_path, gene_symb)
+
+Xt = pd.read_csv(data_path+'/Example_X.csv').drop(columns='Unnamed: 0')
+y = pd.read_csv(data_path+'/Example_y.csv').drop(columns='Unnamed: 0').values
+
+total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob = CVD_Prediction_with_FS_DR(data_path, Xt, y)
diff --git a/code/LICENSE b/code/LICENSE
new file mode 100644
index 
0000000..08320cf
--- /dev/null
+++ b/code/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Jihye Moon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/code/README.md b/code/README.md
new file mode 100644
index 0000000..c87cead
--- /dev/null
+++ b/code/README.md
@@ -0,0 +1,266 @@
+#### **A Literature Embedding Model for Cardiovascular Disease Prediction using Risk Factors, Symptoms, and Genotype Information**
+##### Authors: Jihye Moon, Hugo F. Posada-Quintero, and *Ki H. Chon
+**Contact address**: ki.chon@uconn.edu (*corresponding author), jihye.moon@uconn.edu (questions about the code).
+(Accepted by Expert Systems with Applications on August 24, 2022)
+
+### Contents
+
+This capsule provides the implementation of 1) **literature data collection and preprocessing** and 2) **literature embedding model training and evaluation**. The pre-trained literature embedding model identifies CVD risk factors and associated information for a given input query (e.g., stroke). Also, since our literature embedding model contains representations for CVD-related words, it can serve as a **feature selection (FS) or dimensionality reduction (DR) model on cohort data** for CVD prediction/classification tasks (extrinsic method). We used MESA cohort data consisting of 6,814 subjects and 564 variables in our manuscript. Since our cohort data requires permission, users must prepare their own cohort data to use the literature embedding model for FS or DR tasks. The cohort data must contain variables per subject and the variables' names. This capsule's guideline also provides a pipeline for FS and DR on input cohort data.
+
+1. [Introduction](#introduction)
+2. [Code Implementations and Guidelines](#guidelines)
+    0. [DEMO](#demo)
+        1. [DEMO A) CVD risk factors, genes, and associated information identifications](#demo1)
+        2. [DEMO B) All steps to build a literature embedding model (data collection ~ model training)](#demo2)
+        3. [Reproduction DEMO](#default_demo)
+    1. [Literature data collection](#collection)
+    2. [Literature data preprocessing](#preprocessing)
+    3. [Literature embedding model training](#training)
+    4. [Literature embedding model evaluation](#evaluation)
+    5. [FS and DR applications on cohort data](#applications)
+3. [Results](#results)
+4. [GitHub Source](#github)
+
+### 1. Introduction
+Accurate prediction of cardiovascular disease (CVD) requires multifaceted information consisting of not only a patient's medical history, but also genomic data, symptoms, lifestyle, and risk factors, which are often not incorporated into the decision-making process because the data are vast, difficult to obtain, and require complex algorithms. **Estimating CVD risk factors is now a significant goal for more accurate CVD prediction and treatment**.
+##### Previous work's limitations
+CVD risk factors can be identified from phenotype variables, genetic arrays, text, and image data. Several approaches have been introduced to identify CVD risk factors; they fall into two categories: (1) cohort-based CVD risk factor identification, and (2) literature-based CVD risk factor identification and information management. Category (1) enables objective validation of the identified risk factors using CVD patient data, but the number of available features is limited, which may limit the identification of new CVD risk factors. Category (2) enables the management of significant risk factors using publicly available literature data; however, most methods were not validated using CVD patient data. Hence, **it is critical to develop a novel method that collects information on risk factors, associated symptoms, and mechanisms, and that is objectively validated on CVD patients to be relevant for better clinical diagnosis and treatment management.**
+##### Our proposed work
+In our paper, **we proposed a literature embedding model trained on literature data freely accessible online.** Our model enables the retrieval of CVD risk factors, associated information, and genes independently of population-based data. Even though our literature model was trained only on literature, it selects accurate CVD-related features from population-based cohort data when used as an FS or DR model, which leads to better CVD prediction.
+
+### 2. Code implementation and guidelines
+This section provides descriptions for [0. Demo](#demo) and details for [1. Literature data collection](#collection), [2. Literature data preprocessing](#preprocessing), [3. Literature embedding model training](#training), [4. Literature embedding model evaluation](#evaluation), and [5. FS and DR applications on cohort data](#applications).
+The DEMO gives an overview of our code, and the other five subsections describe the code for each purpose in detail.
+
+We prepared five main scripts, one per goal:
+ 1) step1_data_collection.py,
+ 2) step2_data_preprocessing.py,
+ 3) step3_literature_embedding_training.py,
+ 4) step4_CVD_risk_factor_identification.py,
+ 5) Extrinsic_application_CVD_prediction.py.
+
+We feed different inputs to each main script for each purpose. Details are described below.
+
+#### 2.0. DEMO
+
+We prepared three DEMOs:
+ 1) **DEMO A**: identifies **CVD risk factors, genes, and associated information** using a pre-trained literature model.
+ 2) **DEMO B**: runs all steps of **literature data collection**, **literature data preprocessing**, and **literature embedding model training and intrinsic evaluation (CVD risk factor identifications)**.
+ 3) **Reproduction DEMO**: shows DEMO A's results and provides the **literature embedding model training and evaluation steps**.
+
+On the CodeOcean platform, DEMO A is the default.
+
+##### 2.0.1. 
DEMO A) CVD risk factors, genes, and associated information identifications
+To run DEMO A, run the following command:
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_a'
+~~~~
+
+The command imports our pre-trained literature embedding model at EMBEDDING_PATH='../data/old_model' and captures CVD risk factors and associated information for three queries ('stroke', 'atrial fibrillation', 'ventricular fibrillation').
+The risk factors, associated information, and gene names related to each input query will be displayed and saved in STEP4_OUTPUT_PATH='../results/demo_a'.
+
+##### 2.0.2. DEMO B) All steps to build a literature embedding model (data collection ~ model training)
+To run DEMO B, run the following command on **your local computer**:
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_b'
+~~~~
+DEMO B runs all steps of literature data collection & preprocessing and literature embedding model training & evaluation for CVD risk factor identifications. DEMO B collects only a limited amount of literature data; to collect all available data, set NUM_WORD_BASED_DATA=0 and NUM_GENE_BASED_DATA=0.
+
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_b'
+  echo 'demo b -- '
+  QUERY_WORD='zinc' ## you can define the query word used to collect literature data
+  NUM_WORD_BASED_DATA=500000 #if NUM_WORD_BASED_DATA=0, it collects all possible word-related literature
+  NUM_GENE_BASED_DATA=100 #if NUM_GENE_BASED_DATA=0, it collects all possible gene-related literature
+  BASE_PATH='../results/'
+  DATA_COLLECTION_PATH='../results/demo_b'
+  PREPROCESSEING_PATH='../results/demo_b'
+  EMBEDDING_NAME='pre_trained_demo'
+  EMBEDDING_PATH='../results/pre_trained_demo'
+  EPOCH=2
+  STEP4_OUTPUT_PATH='../results/CVD_searches'
+
+  python -u step1_data_collection.py $QUERY_WORD $NUM_WORD_BASED_DATA $NUM_GENE_BASED_DATA $DATA_COLLECTION_PATH
+  python -u step2_data_preprocessing.py $DATA_COLLECTION_PATH $PREPROCESSEING_PATH
+  python -u step3_literature_embedding_training.py $PREPROCESSEING_PATH $EPOCH $EMBEDDING_NAME
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_NAME $STEP4_OUTPUT_PATH
+~~~~
+
+DEMO B writes the collected literature data, the pre-processed literature data, and the trained literature embedding model to './results'.
+
+##### 2.0.3. Reproduction DEMO
+
+The reproduction DEMO is run with the following command:
+~~~~ {.sourceCode .shell}
+./run.sh
+
+or
+
+./run.sh 'demo_r'
+~~~~
+
+This reproduction DEMO shows 1) CVD risk factor identifications using our paper's pre-trained literature model and 2) all steps of the literature model training process plus risk factor searches using the newly trained model.
+For 2), we prepared a collected literature data set at PREPROCESSEING_PATH='../data/old_preprocessed_data'.
+
+#### 2.1. Literature data collection
+This subsection explains step1_data_collection.py in detail. The script receives four inputs:
+
+~~~~ {.sourceCode .shell}
+  QUERY_WORD='zinc'
+  NUM_WORD_BASED_DATA=0
+  NUM_GENE_BASED_DATA=0
+  DATA_COLLECTION_PATH='../results/$USER_DEFINED'
+
+  python -u step1_data_collection.py $QUERY_WORD $NUM_WORD_BASED_DATA $NUM_GENE_BASED_DATA $DATA_COLLECTION_PATH
+~~~~
+
+In our manuscript, we collected 16k published articles from PubMed using search keywords consisting of a word ("heart") and human gene names, then trained a literature embedding model on the collected abstracts. Table 1 below shows examples of abstracts collected by this script.
+
+*Table 1. An example of collected abstracts*
+| Document type | Keyword | Example |
+|:---|:---|:---|
+|Keyword-based literature from PubMed|Heart|Waist-to-hip ratio (WHR) is a strong predictor of mortality in patients with **heart** failure (HF). Left ventricular diastolic filling function has predictable maturational progression, with significant differences in the intraventricular pressure difference between infants from birth to 2 years. |
+|Gene-name-based literature from PubMed|HMGA1|**HMGA1** has been shown to regulate genes involved with systemic inflammatory processes. We hypothesized that **HMGA1** is important in the function of mesenchymal stromal cells, which are known to modulate inflammatory responses due to sepsis.|
+
+We can change the number of collected documents:
+~~~
+If NUM_WORD_BASED_DATA==0:
+    It collects all documents for $QUERY_WORD.
+elif NUM_WORD_BASED_DATA==100000:
+    It collects 100,000 documents for $QUERY_WORD.
+
+If NUM_GENE_BASED_DATA==0:
+    It collects all possible gene-related documents.
+elif NUM_GENE_BASED_DATA==10:
+    It collects documents for 10*NUM_GENE_BASED_DATA gene names.
+~~~
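+
+For orientation, the sketch below shows the kind of Bio.Entrez calls that such a collection step relies on (compare code/gene_extraction.py in this capsule); the query term, retmax value, and email address here are illustrative, not the script's actual parameters:
+
+~~~~ {.sourceCode .python}
+from Bio import Entrez
+
+Entrez.email = "your.email@example.com"  # NCBI requires a contact address
+
+# find PubMed IDs matching a keyword query (illustrative retmax)
+handle = Entrez.esearch(db="pubmed", term="heart", retmax=100)
+record = Entrez.read(handle)
+handle.close()
+
+# fetch the matching abstracts as plain text
+handle = Entrez.efetch(db="pubmed", id=record["IdList"], rettype="abstract", retmode="text")
+abstracts = handle.read()
+handle.close()
+print(abstracts[:500])
+~~~~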
+
+#### 2.2. Literature data preprocessing
+This subsection explains step2_data_preprocessing.py in detail. The script receives two inputs:
+~~~
+  DATA_COLLECTION_PATH='../results/$USER_DEFINED'
+  PREPROCESSEING_PATH='../results/$USER_DEFINED'
+
+  python -u step2_data_preprocessing.py $DATA_COLLECTION_PATH $PREPROCESSEING_PATH
+~~~
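+
+As a rough illustration of what this step does (the real rules live in step2_data_preprocessing.py), the sketch below applies the same kind of normalization shown in Table 2 below: lowercasing, dropping stop words, masking digits with '#', and tagging gene symbols with a leading '#'. The stop-word list and the normalize_abstract helper are hypothetical simplifications, not the actual implementation:
+
+~~~~ {.sourceCode .python}
+import re
+
+STOP_WORDS = {'a', 'an', 'the', 'of', 'and', 'to', 'in', 'during', 'due'}  # toy list
+
+def normalize_abstract(text, gene_symbols):
+    tokens = []
+    for word in text.split():
+        token = word.strip('.,;:()')
+        if token in gene_symbols:             # tag known gene symbols, e.g. HMGA1 -> #HMGA1
+            tokens.append('#' + token)
+            continue
+        token = token.lower()
+        token = re.sub(r'[0-9]', '#', token)  # mask digits, e.g. A1 -> a#
+        if token and token not in STOP_WORDS:
+            tokens.append(token)
+    return ' '.join(tokens)
+
+print(normalize_abstract('HMGA1 transgene exhibit improved function during sepsis.', {'HMGA1'}))
+# -> '#HMGA1 transgene exhibit improved function sepsis'
+~~~~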
+*Table 2. An example of text preprocessing*
+| Document | Gene Name | Sentence |
+|:---|:---|:---|
+| Original | HMGA1 | Mesenchymal stromal cells expressing a dominant-negative high mobility group A1 transgene exhibit improved function during sepsis. |
+| Pre-processed | #HMGA1 | mesenchymal stromal cells expressing dominant-negative high mobility group a# transgene exhibit improved function sepsis |
+
+
+#### 2.3. Literature embedding model training
+This subsection explains step3_literature_embedding_training.py in detail. The script receives three inputs:
+~~~
+  EMBEDDING_PATH='../results/$MODEL_PATH'
+  EPOCH=2 # the number of epochs for literature embedding model training
+
+  python -u step3_literature_embedding_training.py $PREPROCESSEING_PATH $EPOCH $EMBEDDING_PATH
+~~~
+EMBEDDING_PATH is the embedding model path and EPOCH is the number of training epochs; EPOCH=10 is recommended.
+Our literature embedding model learns literature representations in three steps. To learn 'heart'-related literature, the model trains a basic skip-gram structure, as shown in Fig. 1(a). To learn gene-name-related literature, the model uses the structures in Fig. 1(b) and (c).
+
+Fig. 1. Skip-gram structure of Word2vec
+| (a) step 1| (b) step 2| (c) step 3|
+| :--- | :--- | :--- |
+| ![image](read_me_images/model1_re.jpg)|![image](read_me_images/model2_re.jpg)|![image](read_me_images/model3_re.jpg)|
+|Skip-gram structure to predict context words using a center word in the same document|Our proposed structure (1) to predict a captured document's word contexts from the gene name used as the search query|Our proposed structure (2) to predict gene-name-associated words in a captured document using the gene name|
+
+Users can set hyper-parameters in step3_literature_embedding_training.py:
+~~~~ {step3_literature_embedding_training.py}
+  window_size = 2 # The number of context words per center word for literature model training. Details are in our manuscript.
+  min_count = 5 # Words whose appearance frequency in the documents is lower than min_count are excluded.
+  min_size = 2 # Words with character length <= min_size are excluded.
+  dimension = 128 # Embedding model's dimension
+  num_sampled = 16 # Negative sampling parameter
+  batch_size = 256 # Training batch size
+~~~~
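+
+To make the skip-gram step concrete, the toy sketch below generates (center word, context word) training pairs with window_size = 2, as in Fig. 1(a). It is a simplification of what the training script does internally, not the actual implementation:
+
+~~~~ {.sourceCode .python}
+def skipgram_pairs(tokens, window_size=2):
+    pairs = []
+    for center, word in enumerate(tokens):
+        # context = up to window_size words on each side of the center word
+        for offset in range(-window_size, window_size + 1):
+            context = center + offset
+            if offset != 0 and 0 <= context < len(tokens):
+                pairs.append((word, tokens[context]))
+    return pairs
+
+print(skipgram_pairs(['waist', 'ratio', 'predictor', 'mortality', 'heart']))
+# e.g. ('predictor', 'waist'), ('predictor', 'ratio'), ('predictor', 'mortality'), ...
+~~~~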
+
+#### 2.4. Literature embedding model evaluation (CVD risk factor searches)
+This subsection explains step4_CVD_risk_factor_identification.py in detail. The script receives two inputs:
+
+~~~~ {.sourceCode .shell}
+  EMBEDDING_PATH='../results/$MODEL_PATH'
+  STEP4_OUTPUT_PATH='../results/$SEARCH_PATH'
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+~~~~
+
+Users can put their own queries in step4_CVD_risk_factor_identification.py, like below:
+
+~~~~ {.sourceCode .python}
+queries = ['stroke', 'atrial fibrillation', 'ventricular fibrillation'] #put your own queries in []
+~~~~
+
+#### 2.5. FS and DR applications on cohort data
+This subsection explains Extrinsic_application_CVD_prediction.py in detail. The script has three inputs, set inside Extrinsic_application_CVD_prediction.py:
+ * data_path = '../data/Example'
+ * model_path = '../data/old_model'
+ * output_path = '../results/prediction/'
+
+Users are required to prepare cohort data, a pre-trained embedding model path, and an output path. After running Extrinsic_application_CVD_prediction.py on the user's cohort data, the prediction results and labels produced by our FS and DR processes for each fold of the K-fold cross-validation are saved at output_path. CVD prediction performance can then be evaluated with performance_metrics.metric(label, prediction_results) from lib/performance_metrics.py, as in the sketch below.
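+
+A minimal sketch of that evaluation loop follows; the fold directory layout and file names here are assumptions for illustration, so check what pr.save_label and pr.run_save actually write under output_path in your run:
+
+~~~~ {.sourceCode .python}
+import numpy as np
+import lib.performance_metrics as performance_metrics
+
+output_path = '../results/prediction/'
+for fold in range(5):  # one result directory per cross-validation fold
+    # hypothetical file names -- adjust to the files saved in your run
+    label = np.loadtxt(output_path + str(fold) + '/CVD_label.txt')
+    prediction_results = np.loadtxt(output_path + str(fold) + '/FS.embedding_prediction.txt')
+    performance_metrics.metric(label, prediction_results)
+~~~~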
+All other ML methods (RF, DT, H2FS, PCA, and UMAP) are in the lib/ML_models.py file.
+
+##### Cohort data format #####
+
+Users are required to prepare cohort data with variable names. To show the format of the input data, we generated the example data Example_X (variables per subject) and Example_y (CVD labels per subject) using lib/ExpCohort_Generator.py. Details are in the lib/ExpCohort_Generator.py file; a minimal sketch of the expected format follows.
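+
+As a hedged illustration (hypothetical values and label column name; the actual generator is lib/ExpCohort_Generator.py), a DataFrame with one row per subject and one named variable per column, plus a one-column label file, is all the pipeline expects:
+
+~~~~ {.sourceCode .python}
+import numpy as np
+import pandas as pd
+
+rng = np.random.default_rng(0)
+X = pd.DataFrame(rng.random((5, 3)), columns=['bca', 'nit', 'fhha'])  # variables per subject
+y = pd.DataFrame(rng.integers(0, 2, 5), columns=['CVD'])              # 1 = CVD, 0 = no CVD
+X.to_csv('Example_X.csv'); y.to_csv('Example_y.csv')
+~~~~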
+
+The **format** of the input cohort data (Example_X) should be like below:
+
+*Table 3. The data format example generated by ExpCohort_Generator.py (variable)*
+| Subject | bca | nit | fhha | sbld | pulrate |
+|-----|----------|----------|----------|----------|----------|
+| 0 | 0.296735 | 0.292552 | 0.074269 | 0.886255 | 0.235104 |
+| 1 | 0.699152 | 0.626459 | 0.917815 | 0.988134 | 0.167721 |
+| 2 | 0.484408 | 0.327285 | 0.351393 | 0.946728 | 0.366808 |
+| 3 | 0.970385 | 0.811354 | 0.068369 | 0.246754 | 0.198345 |
+| .. | ... | ... | ... | ... | ... |
+| N | 0.905146 | 0.855485 | 0.657306 | 0.385825 | 0.957396 |
+
+The **format** of the CVD label per subject (Example_y) should be like below:
+
+*Table 4. The cohort data format example generated by ExpCohort_Generator.py (label)*
+| Subject | CVD (Yes=1, No=0) |
+|:---|:---|
+| 1 | 0 |
+| 2 | 1 |
+| 3 | 1 |
+| ... | ... |
+| N | 0 |
+
+### 3. Results
+
+In our manuscript, we used three queries ('stroke', 'atrial fibrillation', 'ventricular fibrillation') for CVD risk factor identifications. We analyzed whether or not the captured words and genes were correctly identified as risk factors and associated symptoms for the input query words. Our model accurately (average accuracy >96%) captured associated risk factors, symptoms, and genes for a given input query word. Details are described in our published manuscript.
+
+We also used our embedding model for FS and DR tasks on cohort data for CVD prediction. Our FS and DR methods provide better performance with the fastest computation time when compared with other popular FS and DR methods: Random Forest, Decision Tree, H2FS, UMAP, and PCA.
+
+Our model has the potential to facilitate easier collation of multifaceted information for better data mining of vast publicly available data, so that efficient and accurate risk factors and symptoms can be identified, which helps better-informed decisions for CVD prediction and treatment.
+
+### 4. GitHub Source
+-------------
+
+This project is also hosted on GitHub ([link](https://github.com/JihyeMooon/CVD_literature_embedding)) and is actively developed.
+
+### Error note
+In the literature data collection process, some errors can happen due to network connections.
+
+If you get an error at the 25/33 point in 'collecting_doc_using_word_based_query', like below:
+~~~
+ 25 / 33
+ Going to download records from 1250001 to 1260000
+ Going to download records from 1260001 to 1270000
+
+ raise HTTPError(req.full_url, code, msg, hdrs, fp)
+ or IncompleteRead: IncompleteRead(20458171 bytes read)
+~~~
+then run collecting_doc_using_word_based_query again with 'w2d_starting_point = 25'.
+
+If you have problems in 'collecting_doc_using_gene_based_query', like below:
+~~~
+ Example: if we get an error at 5 / 2634
+~~~
+then run collecting_doc_using_gene_based_query again with 'g2d_starting_point = 5'.
\ No newline at end of file
diff --git a/code/gene_extraction.py b/code/gene_extraction.py
new file mode 100644
index 0000000..c90b3b9
--- /dev/null
+++ b/code/gene_extraction.py
@@ -0,0 +1,134 @@
+from Bio import Entrez
+from Bio import SeqIO
+import time
+from urllib.error import HTTPError
+from http.client import IncompleteRead
+
+# Set your email address and API key for Entrez
+Entrez.email = "lrmercadod@gmail.com"
+Entrez.api_key = "f095f0c0aad9480d90ee0b869acb43670d08"
+
+# Search for human genes in the Gene database
+handle = Entrez.esearch(db="gene", term="Homo sapiens[Organism]", retmax=10000000)
+human_record = Entrez.read(handle)
+handle.close()
+
+# Search for the human ZIP11 gene
+handle = Entrez.esearch(db="gene", term="ZIP11 AND Homo sapiens[Organism]", retmax=10000000)
+human_zip11_record = Entrez.read(handle)
+handle.close()
+
+# Search for the mouse ZIP11 gene
+handle = Entrez.esearch(db="gene", term="ZIP11 AND Mus musculus[Organism]", retmax=10000000)
+mouse_zip11_record = Entrez.read(handle)
+handle.close()
+
+# Get the lists of gene IDs
+human_gene_ids = human_record["IdList"]
+human_zip11_ids = human_zip11_record["IdList"]
+mouse_zip11_ids = mouse_zip11_record["IdList"]
+
+# Combine all gene IDs
+gene_ids = human_gene_ids + human_zip11_ids + mouse_zip11_ids
+
+# Open the output files
+symbol_file = open("query_symbol.txt", "a", encoding="utf-8") # Append mode
+id_file = open("query_ids.txt", "a", encoding="utf-8") # Append mode
+full_name_file = 
open("query_full_name.txt", "a", encoding="utf-8") # Append mode +snp_file = open("query_snps.txt", "a", encoding="utf-8") # Append mode +error_file = open("error_log.txt", "a", encoding="utf-8") # Append mode for error logging + +max_retries = 5 +retry_delay = 2 +batch_size = 500 +batch_delay = 2 + +# Load the last processed batch from the checkpoint file +checkpoint_file = "checkpoint.txt" +try: + with open(checkpoint_file, "r") as file: + last_processed_batch = int(file.read()) +except FileNotFoundError: + last_processed_batch = 0 + +# Iterate over the gene IDs in batches and fetch the gene information +for i in range(last_processed_batch * batch_size, len(gene_ids), batch_size): + batch_ids = gene_ids[i:i+batch_size] + + for gene_id in batch_ids: + retries = 0 + while retries < max_retries: + try: + handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml") + gene_record = Entrez.read(handle) + handle.close() + break + except (HTTPError, IncompleteRead) as e: + print(f"Error: {str(e)}. Retrying...") + retries += 1 + time.sleep(retry_delay) + else: + print(f"Failed to fetch gene information for gene ID: {gene_id}") + continue + + # Extract the relevant information + if "Entrezgene_gene" in gene_record[0] and "Gene-ref" in gene_record[0]["Entrezgene_gene"]: + gene_ref = gene_record[0]["Entrezgene_gene"]["Gene-ref"] + gene_symbol = gene_ref.get("Gene-ref_locus", "") + gene_full_name = gene_ref.get("Gene-ref_desc", "") + else: + gene_symbol = "" + gene_full_name = "" + + # Retrieve SNP information for the gene + retries = 0 + while retries < max_retries: + try: + handle = Entrez.elink(dbfrom="gene", db="snp", id=gene_id) + snp_record = Entrez.read(handle) + handle.close() + + if snp_record[0]["LinkSetDb"]: + snp_ids = [link["Id"] for link in snp_record[0]["LinkSetDb"][0]["Link"]] + for snp_id in snp_ids: + try: + snp_file.write(str(snp_id) + "\n") + except OSError as e: + error_file.write(f"Error writing SNP ID {snp_id} for gene ID {gene_id}: {str(e)}\n") + else: + try: + snp_file.write("N/A\n") + except OSError as e: + error_file.write(f"Error writing 'N/A' to snp_file for gene ID {gene_id}: {str(e)}\n") + break + except (IndexError, RuntimeError, IncompleteRead) as e: + print(f"Error retrieving SNP information for gene ID: {gene_id}. Retrying...") + retries += 1 + time.sleep(retry_delay) + else: + print(f"Failed to retrieve SNP information for gene ID: {gene_id}") + try: + snp_file.write("N/A\n") + except OSError as e: + error_file.write(f"Error writing 'N/A' to snp_file for gene ID {gene_id}: {str(e)}\n") + + # Write the information to the respective files + symbol_file.write(gene_symbol + "\n") + id_file.write(gene_id + "\n") + full_name_file.write(gene_full_name + "\n") + + # Update the checkpoint file with the last processed batch + with open(checkpoint_file, "w") as file: + file.write(str(i // batch_size)) + + print(f"Processed batch {i//batch_size + 1} of {len(gene_ids)//batch_size + 1}") + time.sleep(batch_delay) + +# Close the output files +symbol_file.close() +id_file.close() +full_name_file.close() +snp_file.close() +error_file.close() + +print("Gene extraction completed.") \ No newline at end of file diff --git a/code/lib/Bio/Affy/CelFile.py b/code/lib/Bio/Affy/CelFile.py new file mode 100644 index 0000000..ee95b0d --- /dev/null +++ b/code/lib/Bio/Affy/CelFile.py @@ -0,0 +1,502 @@ +# Copyright 2004 by Harry Zuzan. All rights reserved. +# Copyright 2016 by Adam Kurkiewicz. All rights reserved. 
+# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Reading information from Affymetrix CEL files version 3 and 4.""" + + +import struct + +try: + import numpy +except ImportError: + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Install NumPy if you want to use Bio.Affy.CelFile" + ) from None + + +class ParserError(ValueError): + """Affymetrix parser error.""" + + def __init__(self, *args): + """Initialise class.""" + super().__init__(*args) + + +class Record: + """Stores the information in a cel file. + + Example usage: + + >>> from Bio.Affy import CelFile + >>> with open("Affy/affy_v3_example.CEL") as handle: + ... c = CelFile.read(handle) + ... + >>> print(c.ncols, c.nrows) + 5 5 + >>> print(c.intensities) + [[ 234. 170. 22177. 164. 22104.] + [ 188. 188. 21871. 168. 21883.] + [ 188. 193. 21455. 198. 21300.] + [ 188. 182. 21438. 188. 20945.] + [ 193. 20370. 174. 20605. 168.]] + >>> print(c.stdevs) + [[ 24. 34.5 2669. 19.7 3661.2] + [ 29.8 29.8 2795.9 67.9 2792.4] + [ 29.8 88.7 2976.5 62. 2914.5] + [ 29.8 76.2 2759.5 49.2 2762. ] + [ 38.8 2611.8 26.6 2810.7 24.1]] + >>> print(c.npix) + [[25 25 25 25 25] + [25 25 25 25 25] + [25 25 25 25 25] + [25 25 25 25 25] + [25 25 25 25 25]] + + """ + + def __init__(self): + """Initialize the class.""" + self.version = None + self.GridCornerUL = None + self.GridCornerUR = None + self.GridCornerLR = None + self.GridCornerLL = None + self.DatHeader = None + self.Algorithm = None + self.AlgorithmParameters = None + self.NumberCells = None + self.intensities = None + self.stdevs = None + self.npix = None + self.nrows = None + self.ncols = None + self.nmask = None + self.mask = None + self.noutliers = None + self.outliers = None + self.modified = None + + +def read(handle, version=None): + """Read Affymetrix CEL file and return Record object. + + CEL files format versions 3 and 4 are supported. + Please specify the CEL file format as 3 or 4 if known for the version + argument. If the version number is not specified, the parser will attempt + to detect the version from the file contents. + + The Record object returned by this function stores the intensities from + the CEL file in record.intensities. + Currently, record.mask and record.outliers are not set in when parsing + version 4 CEL files. + + Example Usage: + + >>> from Bio.Affy import CelFile + >>> with open("Affy/affy_v3_example.CEL") as handle: + ... record = CelFile.read(handle) + ... + >>> record.version == 3 + True + >>> print("%i by %i array" % record.intensities.shape) + 5 by 5 array + + >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: + ... record = CelFile.read(handle, version=4) + ... + >>> record.version == 4 + True + >>> print("%i by %i array" % record.intensities.shape) + 5 by 5 array + + """ + try: + data = handle.read(0) + except AttributeError: + raise ValueError("handle should be a file handle") from None + data = handle.read(4) + if not data: + raise ValueError("Empty file.") + if data == b"[CEL": + raise ValueError("CEL file in version 3 format should be opened in text mode") + if data == "[CEL": + # Version 3 format. Continue to read the header here before passing + # control to _read_v3 to avoid having to seek to the beginning of + # the file. 
+ data += next(handle) + if data.strip() != "[CEL]": + raise ValueError("Failed to parse Affy Version 3 CEL file.") + line = next(handle) + keyword, value = line.split("=", 1) + if keyword != "Version": + raise ValueError("Failed to parse Affy Version 3 CEL file.") + version = int(value) + if version != 3: + raise ValueError("Incorrect version number in Affy Version 3 CEL file.") + return _read_v3(handle) + try: + magicNumber = struct.unpack(" max_size: + max_atoms = [atom] + max_size = atom_dict[atom] + elif atom_dict[atom] == max_size: + max_atoms.append(atom) + + if require_multiple and num_atoms == 1: + consensus += ambiguous + elif (len(max_atoms) == 1) and ( + (float(max_size) / float(num_atoms)) >= threshold + ): + consensus += max_atoms[0] + else: + consensus += ambiguous + + return Seq(consensus) + + def gap_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False): + """Output a fast consensus sequence of the alignment, allowing gaps. + + Same as dumb_consensus(), but allows gap on the output. + + Things to do: + - Let the user define that with only one gap, the result + character in consensus is gap. + - Let the user select gap character, now + it takes the same as input. + + """ + consensus = "" + + # find the length of the consensus we are creating + con_len = self.alignment.get_alignment_length() + + # go through each seq item + for n in range(con_len): + # keep track of the counts of the different atoms we get + atom_dict = {} + num_atoms = 0 + + for record in self.alignment: + # make sure we haven't run past the end of any sequences + # if they are of different lengths + if n < len(record.seq): + if record.seq[n] not in atom_dict: + atom_dict[record.seq[n]] = 1 + else: + atom_dict[record.seq[n]] += 1 + + num_atoms += 1 + + max_atoms = [] + max_size = 0 + + for atom in atom_dict: + if atom_dict[atom] > max_size: + max_atoms = [atom] + max_size = atom_dict[atom] + elif atom_dict[atom] == max_size: + max_atoms.append(atom) + + if require_multiple and num_atoms == 1: + consensus += ambiguous + elif (len(max_atoms) == 1) and ( + (float(max_size) / float(num_atoms)) >= threshold + ): + consensus += max_atoms[0] + else: + consensus += ambiguous + + return Seq(consensus) + + def replacement_dictionary(self, skip_chars=None, letters=None): + """Generate a replacement dictionary to plug into a substitution matrix. + + This should look at an alignment, and be able to generate the number + of substitutions of different residues for each other in the + aligned object. + + Will then return a dictionary with this information:: + + {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} + + This also treats weighted sequences. The following example shows how + we calculate the replacement dictionary. Given the following + multiple sequence alignment:: + + GTATC 0.5 + AT--C 0.8 + CTGTC 1.0 + + For the first column we have:: + + ('A', 'G') : 0.5 * 0.8 = 0.4 + ('C', 'G') : 0.5 * 1.0 = 0.5 + ('A', 'C') : 0.8 * 1.0 = 0.8 + + We then continue this for all of the columns in the alignment, summing + the information for each substitution in each column, until we end + up with the replacement dictionary. + + Arguments: + - skip_chars - Not used; setting it to anything other than None + will raise a ValueError + - letters - An iterable (e.g. a string or list of characters to include. 
+ """ + if skip_chars is not None: + raise ValueError( + "argument skip_chars has been deprecated; instead, please use 'letters' to specify the characters you want to include" + ) + rep_dict = {(letter1, letter2): 0 for letter1 in letters for letter2 in letters} + + # iterate through each record + for rec_num1 in range(len(self.alignment)): + # iterate through each record from one beyond the current record + # to the end of the list of records + for rec_num2 in range(rec_num1 + 1, len(self.alignment)): + # for each pair of records, compare the sequences and add + # the pertinent info to the dictionary + self._pair_replacement( + self.alignment[rec_num1].seq, + self.alignment[rec_num2].seq, + self.alignment[rec_num1].annotations.get("weight", 1.0), + self.alignment[rec_num2].annotations.get("weight", 1.0), + rep_dict, + letters, + ) + + return rep_dict + + def _pair_replacement(self, seq1, seq2, weight1, weight2, dictionary, letters): + """Compare two sequences and generate info on the replacements seen (PRIVATE). + + Arguments: + - seq1, seq2 - The two sequences to compare. + - weight1, weight2 - The relative weights of seq1 and seq2. + - dictionary - The dictionary containing the starting replacement + info that we will modify. + - letters - A list of characters to include when calculating replacements. + + """ + # loop through each residue in the sequences + for residue1, residue2 in zip(seq1, seq2): + if residue1 in letters and residue2 in letters: + dictionary[(residue1, residue2)] += weight1 * weight2 + + def _get_all_letters(self): + """Return a string containing the expected letters in the alignment (PRIVATE).""" + set_letters = set() + for record in self.alignment: + set_letters.update(record.seq) + list_letters = sorted(set_letters) + all_letters = "".join(list_letters) + return all_letters + + def pos_specific_score_matrix(self, axis_seq=None, chars_to_ignore=None): + """Create a position specific score matrix object for the alignment. + + This creates a position specific score matrix (pssm) which is an + alternative method to look at a consensus sequence. + + Arguments: + - chars_to_ignore - A list of all characters not to include in + the pssm. + - axis_seq - An optional argument specifying the sequence to + put on the axis of the PSSM. This should be a Seq object. If nothing + is specified, the consensus sequence, calculated with default + parameters, will be used. + + Returns: + - A PSSM (position specific score matrix) object. 
+ + """ + # determine all of the letters we have to deal with + all_letters = self._get_all_letters() + assert all_letters + + if chars_to_ignore is None: + chars_to_ignore = [] + if not isinstance(chars_to_ignore, list): + raise TypeError("chars_to_ignore should be a list.") + + gap_char = "-" + chars_to_ignore.append(gap_char) + + for char in chars_to_ignore: + all_letters = all_letters.replace(char, "") + + if axis_seq: + left_seq = axis_seq + assert len(axis_seq) == self.alignment.get_alignment_length() + else: + left_seq = self.dumb_consensus() + + pssm_info = [] + # now start looping through all of the sequences and getting info + for residue_num in range(len(left_seq)): + score_dict = dict.fromkeys(all_letters, 0) + for record in self.alignment: + try: + this_residue = record.seq[residue_num] + # if we hit an index error we've run out of sequence and + # should not add new residues + except IndexError: + this_residue = None + + if this_residue and this_residue not in chars_to_ignore: + weight = record.annotations.get("weight", 1.0) + try: + score_dict[this_residue] += weight + except KeyError: + raise ValueError( + "Residue %s not found" % this_residue + ) from None + + pssm_info.append((left_seq[residue_num], score_dict)) + + return PSSM(pssm_info) + + def information_content( + self, + start=0, + end=None, + e_freq_table=None, + log_base=2, + chars_to_ignore=None, + pseudo_count=0, + ): + """Calculate the information content for each residue along an alignment. + + Arguments: + - start, end - The starting an ending points to calculate the + information content. These points should be relative to the first + sequence in the alignment, starting at zero (ie. even if the 'real' + first position in the seq is 203 in the initial sequence, for + the info content, we need to use zero). This defaults to the entire + length of the first sequence. + - e_freq_table - A dictionary specifying the expected frequencies + for each letter (e.g. {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). + Gap characters should not be included, since these should not have + expected frequencies. + - log_base - The base of the logarithm to use in calculating the + information content. This defaults to 2 so the info is in bits. + - chars_to_ignore - A listing of characters which should be ignored + in calculating the info content. Defaults to none. + + Returns: + - A number representing the info content for the specified region. + + Please see the Biopython manual for more information on how information + content is calculated. 
+ + """ + # if no end was specified, then we default to the end of the sequence + if end is None: + end = len(self.alignment[0].seq) + if chars_to_ignore is None: + chars_to_ignore = [] + + if start < 0 or end > len(self.alignment[0].seq): + raise ValueError( + "Start (%s) and end (%s) are not in the range %s to %s" + % (start, end, 0, len(self.alignment[0].seq)) + ) + # determine random expected frequencies, if necessary + random_expected = None + # determine all of the letters we have to deal with + all_letters = self._get_all_letters() + for char in chars_to_ignore: + all_letters = all_letters.replace(char, "") + + info_content = {} + for residue_num in range(start, end): + freq_dict = self._get_letter_freqs( + residue_num, + self.alignment, + all_letters, + chars_to_ignore, + pseudo_count, + e_freq_table, + random_expected, + ) + # print(freq_dict, end="") + column_score = self._get_column_info_content( + freq_dict, e_freq_table, log_base, random_expected + ) + info_content[residue_num] = column_score + # sum up the score + total_info = sum(info_content.values()) + # fill in the ic_vector member: holds IC for each column + # reset ic_vector to empty list at each call + self.ic_vector = [] + for (i, k) in enumerate(info_content): + self.ic_vector.append(info_content[i + start]) + return total_info + + def _get_letter_freqs( + self, + residue_num, + all_records, + letters, + to_ignore, + pseudo_count=0, + e_freq_table=None, + random_expected=None, + ): + """Determine the frequency of specific letters in the alignment (PRIVATE). + + Arguments: + - residue_num - The number of the column we are getting frequencies + from. + - all_records - All of the SeqRecords in the alignment. + - letters - The letters we are interested in getting the frequency + for. + - to_ignore - Letters we are specifically supposed to ignore. + - pseudo_count - Optional argument specifying the Pseudo count (k) + to add in order to prevent a frequency of 0 for a letter. + - e_freq_table - An optional argument specifying a dictionary with + the expected frequencies for each letter. + - random_expected - Optional argument that specify the frequency to use + when e_freq_table is not defined. + + This will calculate the frequencies of each of the specified letters + in the alignment at the given frequency, and return this as a + dictionary where the keys are the letters and the values are the + frequencies. Pseudo count can be added to prevent a null frequency + """ + freq_info = dict.fromkeys(letters, 0) + + total_count = 0 + + gap_char = "-" + + if pseudo_count < 0: + raise ValueError( + "Positive value required for pseudo_count, %s provided" % (pseudo_count) + ) + + # collect the count info into the dictionary for all the records + for record in all_records: + try: + if record.seq[residue_num] not in to_ignore: + weight = record.annotations.get("weight", 1.0) + freq_info[record.seq[residue_num]] += weight + total_count += weight + except KeyError: + raise ValueError( + "Residue %s not found in letters %s" + % (record.seq[residue_num], letters) + ) from None + + if e_freq_table: + # check if all the residus in freq_info are in e_freq_table + for key in freq_info: + if key != gap_char and key not in e_freq_table: + raise ValueError("%s not found in expected frequency table" % key) + + if total_count == 0: + # This column must be entirely ignored characters + for letter in freq_info: + assert freq_info[letter] == 0 + # TODO - Map this to NA or NaN? 
+ else: + # now convert the counts into frequencies + for letter in freq_info: + if pseudo_count and (random_expected or e_freq_table): + # use either the expected random freq or the + if e_freq_table: + ajust_freq = e_freq_table[letter] + else: + ajust_freq = random_expected + + ajusted_letter_count = freq_info[letter] + ajust_freq * pseudo_count + ajusted_total = total_count + pseudo_count + freq_info[letter] = ajusted_letter_count / ajusted_total + + else: + freq_info[letter] = freq_info[letter] / total_count + + return freq_info + + def _get_column_info_content( + self, obs_freq, e_freq_table, log_base, random_expected + ): + """Calculate the information content for a column (PRIVATE). + + Arguments: + - obs_freq - The frequencies observed for each letter in the column. + - e_freq_table - An optional argument specifying a dictionary with + the expected frequencies for each letter. + - log_base - The base of the logarithm to use in calculating the + info content. + + """ + gap_char = "-" + + if e_freq_table: + # check the expected freq information to make sure it is good + for key in obs_freq: + if key != gap_char and key not in e_freq_table: + raise ValueError( + f"Frequency table provided does not contain observed letter {key}" + ) + + total_info = 0.0 + + for letter in obs_freq: + inner_log = 0.0 + # if we have expected frequencies, modify the log value by them + # gap characters do not have expected frequencies, so they + # should just be the observed frequency. + if letter != gap_char: + if e_freq_table: + inner_log = obs_freq[letter] / e_freq_table[letter] + else: + inner_log = obs_freq[letter] / random_expected + # if the observed frequency is zero, we don't add any info to the + # total information content + if inner_log > 0: + letter_info = ( + obs_freq[letter] * math.log(inner_log) / math.log(log_base) + ) + total_info += letter_info + return total_info + + def get_column(self, col): + """Return column of alignment.""" + # TODO - Deprecate this and implement slicing? + return self.alignment[:, col] + + +class PSSM: + """Represent a position specific score matrix. + + This class is meant to make it easy to access the info within a PSSM + and also make it easy to print out the information in a nice table. + + Let's say you had an alignment like this:: + + GTATC + AT--C + CTGTC + + The position specific score matrix (when printed) looks like:: + + G A T C + G 1 1 0 1 + T 0 0 3 0 + A 1 1 0 0 + T 0 0 2 0 + C 0 0 0 3 + + You can access a single element of the PSSM using the following:: + + your_pssm[sequence_number][residue_count_name] + + For instance, to get the 'T' residue for the second element in the + above alignment you would need to do: + + your_pssm[1]['T'] + """ + + def __init__(self, pssm): + """Initialize with pssm data to represent. + + The pssm passed should be a list with the following structure: + + list[0] - The letter of the residue being represented (for instance, + from the example above, the first few list[0]s would be GTAT... + list[1] - A dictionary with the letter substitutions and counts. 
+ """ + self.pssm = pssm + + def __getitem__(self, pos): + return self.pssm[pos][1] + + def __str__(self): + out = " " + all_residues = sorted(self.pssm[0][1]) + + # first print out the top header + for res in all_residues: + out += " %s" % res + out += "\n" + + # for each item, write out the substitutions + for item in self.pssm: + out += "%s " % item[0] + for res in all_residues: + out += " %.1f" % item[1][res] + + out += "\n" + return out + + def get_residue(self, pos): + """Return the residue letter at the specified position.""" + return self.pssm[pos][0] + + +def print_info_content(summary_info, fout=None, rep_record=0): + """3 column output: position, aa in representative sequence, ic_vector value.""" + fout = fout or sys.stdout + if not summary_info.ic_vector: + summary_info.information_content() + rep_sequence = summary_info.alignment[rep_record].seq + for pos, ic in enumerate(summary_info.ic_vector): + fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos], ic)) diff --git a/code/lib/Bio/Align/Applications/_ClustalOmega.py b/code/lib/Bio/Align/Applications/_ClustalOmega.py new file mode 100644 index 0000000..2181bc5 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_ClustalOmega.py @@ -0,0 +1,269 @@ +# Copyright 2011 by Andreas Wilm. All rights reserved. +# Based on ClustalW wrapper copyright 2009 by Cymon J. Cox. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment program Clustal Omega.""" + + +from Bio.Application import _Option, _Switch, AbstractCommandline + + +class ClustalOmegaCommandline(AbstractCommandline): + """Command line wrapper for clustal omega. + + http://www.clustal.org/omega + + Notes + ----- + Last checked against version: 1.2.0 + + References + ---------- + Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R, + McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011). + Fast, scalable generation of high-quality protein multiple + sequence alignments using Clustal Omega. + Molecular Systems Biology 7:539 https://doi.org/10.1038/msb.2011.75 + + Examples + -------- + >>> from Bio.Align.Applications import ClustalOmegaCommandline + >>> in_file = "unaligned.fasta" + >>> out_file = "aligned.fasta" + >>> clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True) + >>> print(clustalomega_cline) + clustalo -i unaligned.fasta -o aligned.fasta --auto -v + + You would typically run the command line with clustalomega_cline() or via + the Python subprocess module, as described in the Biopython tutorial. 
+ + """ + + def __init__(self, cmd="clustalo", **kwargs): + """Initialize the class.""" + # order parameters in the same order as clustalo --help + self.parameters = [ + # Sequence Input + _Option( + ["-i", "--in", "--infile", "infile"], + "Multiple sequence input file", + filename=True, + equate=False, + ), + _Option( + ["--hmm-in", "HMM input", "hmm_input"], + "HMM input files", + filename=True, + equate=False, + ), + _Switch(["--dealign", "dealign"], "Dealign input sequences"), + _Option( + ["--profile1", "--p1", "profile1"], + "Pre-aligned multiple sequence file (aligned columns will be kept fix).", + filename=True, + equate=False, + ), + _Option( + ["--profile2", "--p2", "profile2"], + "Pre-aligned multiple sequence file (aligned columns will be kept fix).", + filename=True, + equate=False, + ), + _Option( + ["-t", "--seqtype", "seqtype"], + "{Protein, RNA, DNA} Force a sequence type (default: auto).", + equate=False, + checker_function=lambda x: x + in ["protein", "rna", "dna", "Protein", "RNA", "DNA", "PROTEIN"], + ), + _Switch( + ["--is-profile", "isprofile"], + "disable check if profile, force profile (default no)", + ), + _Option( + ["--infmt", "infmt"], + """Forced sequence input file format (default: auto) + + Allowed values: a2m, fa[sta], clu[stal], msf, phy[lip], selex, st[ockholm], vie[nna] + """, + equate=False, + checker_function=lambda x: x + in [ + "a2m", + "fa", + "fasta", + "clu", + "clustal", + "msf", + "phy", + "phylip", + "selex", + "st", + "stockholm", + "vie", + "vienna", + ], + ), + # Clustering + _Option( + ["--distmat-in", "distmat_in"], + "Pairwise distance matrix input file (skips distance computation).", + filename=True, + equate=False, + ), + _Option( + ["--distmat-out", "distmat_out"], + "Pairwise distance matrix output file.", + filename=True, + equate=False, + ), + _Option( + ["--guidetree-in", "guidetree_in"], + "Guide tree input file (skips distance computation and guide-tree clustering step).", + filename=True, + equate=False, + ), + _Option( + ["--guidetree-out", "guidetree_out"], + "Guide tree output file.", + filename=True, + equate=False, + ), + _Switch( + ["--full", "distmat_full"], + "Use full distance matrix for guide-tree calculation (slow; mBed is default)", + ), + _Switch( + ["--full-iter", "distmat_full_iter"], + "Use full distance matrix for guide-tree calculation during iteration (mBed is default)", + ), + _Option( + ["--cluster-size", "clustersize"], + "soft maximum of sequences in sub-clusters", + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["--clustering-out", "clusteringout"], + "Clustering output file", + filename=True, + ), + _Switch( + ["--use-kimura", "usekimura"], + "use Kimura distance correction for aligned sequences (default no)", + ), + _Switch( + ["--percent-id", "percentid"], + "convert distances into percent identities (default no)", + ), + # Alignment Output + _Option( + ["-o", "--out", "--outfile", "outfile"], + "Multiple sequence alignment output file (default: stdout).", + filename=True, + equate=False, + ), + _Option( + ["--outfmt", "outfmt"], + "MSA output file format:" + " a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]" + " (default: fasta).", + equate=False, + checker_function=lambda x: x + in [ + "a2m", + "fa", + "fasta", + "clu", + "clustal", + "msf", + "phy", + "phylip", + "selex", + "st", + "stockholm", + "vie", + "vienna", + ], + ), + _Switch( + ["--residuenumber", "--resno", "residuenumber"], + "in Clustal format print residue numbers (default no)", + ), + _Option( + 
["--wrap", "wrap"], + "number of residues before line-wrap in output", + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["--output-order", "outputorder"], + "MSA output order like in input/guide-tree", + checker_function=lambda x: x in ["input-order", "tree-order"], + ), + # Iteration + _Option( + ["--iterations", "--iter", "iterations"], + "Number of (combined guide-tree/HMM) iterations", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["--max-guidetree-iterations", "max_guidetree_iterations"], + "Maximum number of guidetree iterations", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["--max-hmm-iterations", "max_hmm_iterations"], + "Maximum number of HMM iterations", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + # Limits (will exit early, if exceeded): + _Option( + ["--maxnumseq", "maxnumseq"], + "Maximum allowed number of sequences", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["--maxseqlen", "maxseqlen"], + "Maximum allowed sequence length", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + # Miscellaneous: + _Switch( + ["--auto", "auto"], + "Set options automatically (might overwrite some of your options)", + ), + _Option( + ["--threads", "threads"], + "Number of processors to use", + equate=False, + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["-l", "--log", "log"], + "Log all non-essential output to this file.", + filename=True, + equate=False, + ), + _Switch(["-h", "--help", "help"], "Print help and exit."), + _Switch(["-v", "--verbose", "verbose"], "Verbose output"), + _Switch(["--version", "version"], "Print version information and exit"), + _Switch( + ["--long-version", "long_version"], + "Print long version information and exit", + ), + _Switch(["--force", "force"], "Force file overwriting."), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_Clustalw.py b/code/lib/Bio/Align/Applications/_Clustalw.py new file mode 100644 index 0000000..777e411 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_Clustalw.py @@ -0,0 +1,486 @@ +# Copyright 2009 by Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment program Clustal W.""" + + +import os +from Bio.Application import _Option, _Switch, AbstractCommandline + + +class ClustalwCommandline(AbstractCommandline): + """Command line wrapper for clustalw (version one or two). + + http://www.clustal.org/ + + Notes + ----- + Last checked against versions: 1.83 and 2.1 + + References + ---------- + Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA, + McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD, + Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0. + Bioinformatics, 23, 2947-2948. 
+ + Examples + -------- + >>> from Bio.Align.Applications import ClustalwCommandline + >>> in_file = "unaligned.fasta" + >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file) + >>> print(clustalw_cline) + clustalw2 -infile=unaligned.fasta + + You would typically run the command line with clustalw_cline() or via + the Python subprocess module, as described in the Biopython tutorial. + + """ + + # TODO - Should we default to cmd="clustalw2" now? + def __init__(self, cmd="clustalw", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-infile", "-INFILE", "INFILE", "infile"], + "Input sequences.", + filename=True, + ), + _Option( + ["-profile1", "-PROFILE1", "PROFILE1", "profile1"], + "Profiles (old alignment).", + filename=True, + ), + _Option( + ["-profile2", "-PROFILE2", "PROFILE2", "profile2"], + "Profiles (old alignment).", + filename=True, + ), + # ################# VERBS (do things) ############################# + _Switch( + ["-options", "-OPTIONS", "OPTIONS", "options"], + "List the command line parameters", + ), + _Switch( + ["-help", "-HELP", "HELP", "help"], "Outline the command line params." + ), + _Switch( + ["-check", "-CHECK", "CHECK", "check"], + "Outline the command line params.", + ), + _Switch( + ["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"], + "Output full help content.", + ), + _Switch( + ["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment." + ), + _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."), + _Switch( + ["-pim", "-PIM", "PIM", "pim"], + "Output percent identity matrix (while calculating the tree).", + ), + _Option( + ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"], + "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).", + checker_function=lambda x: isinstance(x, int), + ), + _Switch( + ["-convert", "-CONVERT", "CONVERT", "convert"], + "Output the input sequences in a different file format.", + ), + # #################### PARAMETERS (set things) ######################### + # ***General settings:**** + # Makes no sense in biopython + # _Option(["-interactive", "-INTERACTIVE", "INTERACTIVE", "interactive"], + # [], + # lambda x: 0, # Does not take value + # False, + # "read command line, then enter normal interactive menus", + # False), + _Switch( + ["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"], + "Use FAST algorithm for the alignment guide tree", + ), + _Option( + ["-type", "-TYPE", "TYPE", "type"], + "PROTEIN or DNA sequences", + checker_function=lambda x: x in ["PROTEIN", "DNA", "protein", "dna"], + ), + _Switch( + ["-negative", "-NEGATIVE", "NEGATIVE", "negative"], + "Protein alignment with negative values in matrix", + ), + _Option( + ["-outfile", "-OUTFILE", "OUTFILE", "outfile"], + "Output sequence alignment file name", + filename=True, + ), + _Option( + ["-output", "-OUTPUT", "OUTPUT", "output"], + "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA", + checker_function=lambda x: x + in [ + "CLUSTAL", + "GCG", + "GDE", + "PHYLIP", + "PIR", + "NEXUS", + "FASTA", + "clustal", + "gcg", + "gde", + "phylip", + "pir", + "nexus", + "fasta", + ], + ), + _Option( + ["-outorder", "-OUTORDER", "OUTORDER", "outorder"], + "Output taxon order: INPUT or ALIGNED", + checker_function=lambda x: x + in ["INPUT", "input", "ALIGNED", "aligned"], + ), + _Option( + ["-case", "-CASE", "CASE", "case"], + "LOWER or UPPER (for GDE output only)", + checker_function=lambda x: x in ["UPPER", "upper", "LOWER", "lower"], + ), + _Option( + ["-seqnos", "-SEQNOS", 
"SEQNOS", "seqnos"], + "OFF or ON (for Clustal output only)", + checker_function=lambda x: x in ["ON", "on", "OFF", "off"], + ), + _Option( + ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"], + "OFF or ON (NEW- for all output formats)", + checker_function=lambda x: x in ["ON", "on", "OFF", "off"], + ), + _Option( + ["-range", "-RANGE", "RANGE", "range"], + "Sequence range to write starting m to m+n. " + "Input as string eg. '24,200'", + ), + _Option( + ["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"], + "Maximum allowed input sequence length", + checker_function=lambda x: isinstance(x, int), + ), + _Switch( + ["-quiet", "-QUIET", "QUIET", "quiet"], + "Reduce console output to minimum", + ), + _Option( + ["-stats", "-STATS", "STATS", "stats"], + "Log some alignment statistics to file", + filename=True, + ), + # ***Fast Pairwise Alignments:*** + _Option( + ["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"], + "Word size", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"], + "Number of best diags.", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-window", "-WINDOW", "WINDOW", "window"], + "Window around best diags.", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"], + "Gap penalty", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-score", "-SCORE", "SCORE", "score"], + "Either: PERCENT or ABSOLUTE", + checker_function=lambda x: x + in ["percent", "PERCENT", "absolute", "ABSOLUTE"], + ), + # ***Slow Pairwise Alignments:*** + _Option( + ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"], + "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", + checker_function=lambda x: ( + x + in [ + "BLOSUM", + "PAM", + "GONNET", + "ID", + "blosum", + "pam", + "gonnet", + "id", + ] + or os.path.exists(x) + ), + filename=True, + ), + _Option( + ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"], + "DNA weight matrix=IUB, CLUSTALW or filename", + checker_function=lambda x: ( + x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x) + ), + filename=True, + ), + _Option( + ["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"], + "Gap opening penalty", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"], + "Gap extension penalty", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + # ***Multiple Alignments:*** + _Option( + ["-newtree", "-NEWTREE", "NEWTREE", "newtree"], + "Output file name for newly created guide tree", + filename=True, + ), + _Option( + ["-usetree", "-USETREE", "USETREE", "usetree"], + "File name of guide tree", + checker_function=lambda x: os.path.exists, + filename=True, + ), + _Option( + ["-matrix", "-MATRIX", "MATRIX", "matrix"], + "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", + checker_function=lambda x: ( + x + in [ + "BLOSUM", + "PAM", + "GONNET", + "ID", + "blosum", + "pam", + "gonnet", + "id", + ] + or os.path.exists(x) + ), + filename=True, + ), + _Option( + ["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"], + "DNA weight matrix=IUB, CLUSTALW or filename", + checker_function=lambda x: ( + x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x) + ), + filename=True, + ), + _Option( + ["-gapopen", "-GAPOPEN", "GAPOPEN", 
"gapopen"], + "Gap opening penalty", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-gapext", "-GAPEXT", "GAPEXT", "gapext"], + "Gap extension penalty", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Switch( + ["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"], + "No end gap separation pen.", + ), + _Option( + ["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"], + "Gap separation pen. range", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Switch( + ["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off" + ), + _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"), + _Switch( + ["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"], + "List hydrophilic res.", + ), + _Option( + ["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"], + "% ident. for delay", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + # Already handled in General Settings section, but appears a second + # time under Multiple Alignments in the help + # _Option(["-type", "-TYPE", "TYPE", "type"], + # "PROTEIN or DNA", + # checker_function=lambda x: x in ["PROTEIN", "DNA", + # "protein", "dna"]), + _Option( + ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"], + "Transitions weighting", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-iteration", "-ITERATION", "ITERATION", "iteration"], + "NONE or TREE or ALIGNMENT", + checker_function=lambda x: x + in ["NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"], + ), + _Option( + ["-numiter", "-NUMITER", "NUMITER", "numiter"], + "maximum number of iterations to perform", + checker_function=lambda x: isinstance(x, int), + ), + _Switch( + ["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"], + "Disable sequence weighting", + ), + # ***Profile Alignments:*** + _Switch( + ["-profile", "-PROFILE", "PROFILE", "profile"], + "Merge two alignments by profile alignment", + ), + _Option( + ["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"], + "Output file name for new guide tree of profile1", + filename=True, + ), + _Option( + ["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"], + "Output file for new guide tree of profile2", + filename=True, + ), + _Option( + ["-usetree1", "-USETREE1", "USETREE1", "usetree1"], + "File name of guide tree for profile1", + checker_function=lambda x: os.path.exists, + filename=True, + ), + _Option( + ["-usetree2", "-USETREE2", "USETREE2", "usetree2"], + "File name of guide tree for profile2", + checker_function=lambda x: os.path.exists, + filename=True, + ), + # ***Sequence to Profile Alignments:*** + _Switch( + ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"], + "Sequentially add profile2 sequences to profile1 alignment", + ), + # These are already handled in the Multiple Alignments section, + # but appear a second time here in the help. 
+ # _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"], + # "File for new guide tree", + # filename=True), + # _Option(["-usetree", "-USETREE", "USETREE", "usetree"], + # "File for old guide tree", + # checker_function=lambda x: os.path.exists, + # filename=True), + # ***Structure Alignments:*** + _Switch( + ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"], + "Do not use secondary structure-gap penalty mask for profile 1", + ), + _Switch( + ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"], + "Do not use secondary structure-gap penalty mask for profile 2", + ), + _Option( + ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"], + "STRUCTURE or MASK or BOTH or NONE output in alignment file", + checker_function=lambda x: x + in [ + "STRUCTURE", + "MASK", + "BOTH", + "NONE", + "structure", + "mask", + "both", + "none", + ], + ), + _Option( + ["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"], + "Gap penalty for helix core residues", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"], + "gap penalty for strand core residues", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"], + "Gap penalty for loop regions", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"], + "Gap penalty for structure termini", + checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)), + ), + _Option( + ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"], + "Number of residues inside helix to be treated as terminal", + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"], + "Number of residues outside helix to be treated as terminal", + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"], + "Number of residues inside strand to be treated as terminal", + checker_function=lambda x: isinstance(x, int), + ), + _Option( + ["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"], + "Number of residues outside strand to be treated as terminal", + checker_function=lambda x: isinstance(x, int), + ), + # ***Trees:*** + _Option( + ["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"], + "nj OR phylip OR dist OR nexus", + checker_function=lambda x: x + in ["NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus"], + ), + _Option( + ["-seed", "-SEED", "SEED", "seed"], + "Seed number for bootstraps.", + checker_function=lambda x: isinstance(x, int), + ), + _Switch( + ["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction." 
+ ), + _Switch( + ["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"], + "Ignore positions with gaps.", + ), + _Option( + ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"], + "Node OR branch position of bootstrap values in tree display", + checker_function=lambda x: x in ["NODE", "BRANCH", "node", "branch"], + ), + _Option( + ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"], + "NJ or UPGMA", + checker_function=lambda x: x in ["NJ", "UPGMA", "nj", "upgma"], + ), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_Dialign.py b/code/lib/Bio/Align/Applications/_Dialign.py new file mode 100644 index 0000000..52be1b1 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_Dialign.py @@ -0,0 +1,243 @@ +# Copyright 2009 by Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment program DIALIGN2-2.""" + +from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline + + +class DialignCommandline(AbstractCommandline): + """Command line wrapper for the multiple alignment program DIALIGN2-2. + + http://bibiserv.techfak.uni-bielefeld.de/dialign/welcome.html + + Notes + ----- + Last checked against version: 2.2 + + References + ---------- + B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence + Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36. + + Examples + -------- + To align a FASTA file (unaligned.fasta) with the output files names + aligned.* including a FASTA output file (aligned.fa), use: + + >>> from Bio.Align.Applications import DialignCommandline + >>> dialign_cline = DialignCommandline(input="unaligned.fasta", + ... fn="aligned", fa=True) + >>> print(dialign_cline) + dialign2-2 -fa -fn aligned unaligned.fasta + + You would typically run the command line with dialign_cline() or via + the Python subprocess module, as described in the Biopython tutorial. + + """ + + def __init__(self, cmd="dialign2-2", **kwargs): + """Initialize the class.""" + self.program_name = cmd + self.parameters = [ + _Switch( + ["-afc", "afc"], + r"Creates additional output file '\*.afc' " + "containing data of all fragments considered " + "for alignment WARNING: this file can be HUGE !", + ), + _Switch( + ["-afc_v", "afc_v"], + "Like '-afc' but verbose: fragments are explicitly " + "printed. WARNING: this file can be EVEN BIGGER !", + ), + _Switch( + ["-anc", "anc"], + "Anchored alignment. Requires a file .anc " + "containing anchor points.", + ), + _Switch( + ["-cs", "cs"], + "If segments are translated, not only the 'Watson " + "strand' but also the 'Crick strand' is looked at.", + ), + _Switch(["-cw", "cw"], "Additional output file in CLUSTAL W format."), + _Switch( + ["-ds", "ds"], + "'dna alignment speed up' - non-translated nucleic acid " + "fragments are taken into account only if they start " + "with at least two matches. 
Speeds up DNA alignment at "
+                "the expense of sensitivity.",
+            ),
+            _Switch(["-fa", "fa"], "Additional output file in FASTA format."),
+            _Switch(
+                ["-ff", "ff"],
+                r"Creates file \*.frg containing information about all "
+                "fragments that are part of the respective optimal "
+                "pairwise alignments plus information about "
+                "consistency in the multiple alignment",
+            ),
+            _Option(
+                ["-fn", "fn"],
+                "Output files are named <out_file>.<suffix>.",
+                equate=False,
+            ),
+            _Switch(
+                ["-fop", "fop"],
+                r"Creates file \*.fop containing coordinates of all "
+                "fragments that are part of the respective pairwise alignments.",
+            ),
+            _Switch(
+                ["-fsm", "fsm"],
+                r"Creates file \*.fsm containing coordinates of all "
+                "fragments that are part of the final alignment",
+            ),
+            _Switch(
+                ["-iw", "iw"],
+                "Overlap weights switched off (by default, overlap "
+                "weights are used if up to 35 sequences are aligned). "
+                "This option speeds up the alignment but may lead "
+                "to reduced alignment quality.",
+            ),
+            _Switch(
+                ["-lgs", "lgs"],
+                "'long genomic sequences' - combines the following "
+                "options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, "
+                "-fop, -ff, -cs, -ds, -pst ",
+            ),
+            _Switch(
+                ["-lgs_t", "lgs_t"],
+                "Like '-lgs' but with all segment pairs assessed "
+                "at the peptide level (rather than 'mixed alignments' "
+                "as with the '-lgs' option). Therefore faster than "
+                "-lgs but not very sensitive for non-coding regions.",
+            ),
+            _Option(
+                ["-lmax", "lmax"],
+                "Maximum fragment length = x (default: x = 40 or "
+                "x = 120 for 'translated' fragments). Shorter x "
+                "speeds up the program but may affect alignment quality.",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Switch(
+                ["-lo", "lo"],
+                r"(Long Output) Additional file \*.log with information "
+                "about fragments selected for pairwise alignment and "
+                "about consistency in multi-alignment procedure.",
+            ),
+            _Switch(
+                ["-ma", "ma"],
+                "'mixed alignments' consisting of P-fragments and "
+                "N-fragments if nucleic acid sequences are aligned.",
+            ),
+            _Switch(
+                ["-mask", "mask"],
+                "Residues not belonging to selected fragments are "
+                r"replaced by '\*' characters in output alignment "
+                "(rather than being printed in lower-case characters)",
+            ),
+            _Switch(
+                ["-mat", "mat"],
+                r"Creates file \*mat with substitution counts derived "
+                "from the fragments that have been selected for alignment.",
+            ),
+            _Switch(
+                ["-mat_thr", "mat_thr"],
+                "Like '-mat' but only fragments with weight score "
+                "> t are considered",
+            ),
+            _Switch(
+                ["-max_link", "max_link"],
+                "'maximum linkage' clustering used to construct "
+                "sequence tree (instead of UPGMA).",
+            ),
+            _Switch(["-min_link", "min_link"], "'minimum linkage' clustering used."),
+            _Option(["-mot", "mot"], "'motif' option.", equate=False),
+            _Switch(["-msf", "msf"], "Separate output file in MSF format."),
+            _Switch(
+                ["-n", "n"],
+                "Input sequences are nucleic acid sequences. "
+                "No translation of fragments.",
+            ),
+            _Switch(
+                ["-nt", "nt"],
+                "Input sequences are nucleic acid sequences and "
+                "'nucleic acid segments' are translated to 'peptide "
+                "segments'.",
+            ),
+            _Switch(
+                ["-nta", "nta"],
+                "'no textual alignment' - textual alignment suppressed. "
+                "This option makes sense if other output files are of "
+                "interest -- e.g.
the fragment files created with -ff, " + "-fop, -fsm or -lo.", + ), + _Switch( + ["-o", "o"], + "Fast version, resulting alignments may be slightly different.", + ), + _Switch( + ["-ow", "ow"], + "Overlap weights enforced (By default, overlap weights " + "are used only if up to 35 sequences are aligned since " + "calculating overlap weights is time consuming).", + ), + _Switch( + ["-pst", "pst"], + r"'print status'. Creates and updates a file \*.sta with " + "information about the current status of the program " + "run. This option is recommended if large data sets " + "are aligned since it allows the user to estimate the " + "remaining running time.", + ), + _Switch( + ["-smin", "smin"], + "Minimum similarity value for first residue pair " + "(or codon pair) in fragments. Speeds up protein " + "alignment or alignment of translated DNA fragments " + "at the expense of sensitivity.", + ), + _Option( + ["-stars", "stars"], + r"Maximum number of '\*' characters indicating degree " + "of local similarity among sequences. By default, no " + "stars are used but numbers between 0 and 9, instead.", + checker_function=lambda x: x in range(0, 10), + equate=False, + ), + _Switch(["-stdo", "stdo"], "Results written to standard output."), + _Switch( + ["-ta", "ta"], + "Standard textual alignment printed (overrides " + "suppression of textual alignments in special " + "options, e.g. -lgs)", + ), + _Option( + ["-thr", "thr"], + "Threshold T = x.", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + _Switch( + ["-xfr", "xfr"], + "'exclude fragments' - list of fragments can be " + "specified that are NOT considered for pairwise alignment", + ), + _Argument( + ["input"], + "Input file name. Must be FASTA format", + filename=True, + is_required=True, + ), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_MSAProbs.py b/code/lib/Bio/Align/Applications/_MSAProbs.py new file mode 100644 index 0000000..74b26a1 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_MSAProbs.py @@ -0,0 +1,89 @@ +# Copyright 2013 by Christian Brueffer. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple sequence alignment program MSAProbs.""" + +from Bio.Application import _Argument, _Option, _Switch, AbstractCommandline + + +class MSAProbsCommandline(AbstractCommandline): + """Command line wrapper for MSAProbs. + + http://msaprobs.sourceforge.net + + Notes + ----- + Last checked against version: 0.9.7 + + References + ---------- + Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple + sequence alignment based on pair hidden Markov models and partition + function posterior probabilities". Bioinformatics, 2010, 26(16): 1958 -1964 + + Examples + -------- + >>> from Bio.Align.Applications import MSAProbsCommandline + >>> in_file = "unaligned.fasta" + >>> out_file = "aligned.cla" + >>> cline = MSAProbsCommandline(infile=in_file, outfile=out_file, clustalw=True) + >>> print(cline) + msaprobs -o aligned.cla -clustalw unaligned.fasta + + You would typically run the command line with cline() or via + the Python subprocess module, as described in the Biopython tutorial. 
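+
+    As a minimal sketch (assuming msaprobs is installed and on your PATH,
+    and reusing the placeholder file names from the example above)::
+
+        stdout, stderr = cline()
+        from Bio import AlignIO
+        align = AlignIO.read(out_file, "clustal")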
+ + """ + + def __init__(self, cmd="msaprobs", **kwargs): + """Initialize the class.""" + # order of parameters is the same as in msaprobs -help + self.parameters = [ + _Option( + ["-o", "--outfile", "outfile"], + "specify the output file name (STDOUT by default)", + filename=True, + equate=False, + ), + _Option( + ["-num_threads", "numthreads"], + "specify the number of threads used, and otherwise detect automatically", + checker_function=lambda x: isinstance(x, int), + ), + _Switch( + ["-clustalw", "clustalw"], + "use CLUSTALW output format instead of FASTA format", + ), + _Option( + ["-c", "consistency"], + "use 0 <= REPS <= 5 (default: 2) passes of consistency transformation", + checker_function=lambda x: isinstance(x, int) and 0 <= x <= 5, + ), + _Option( + ["-ir", "--iterative-refinement", "iterative_refinement"], + "use 0 <= REPS <= 1000 (default: 10) passes of iterative-refinement", + checker_function=lambda x: isinstance(x, int) and 0 <= x <= 1000, + ), + _Switch(["-v", "verbose"], "report progress while aligning (default: off)"), + _Option( + ["-annot", "annot"], + "write annotation for multiple alignment to FILENAME", + filename=True, + ), + _Switch( + ["-a", "--alignment-order", "alignment_order"], + "print sequences in alignment order rather than input order (default: off)", + ), + _Option(["-version", "version"], "print out version of MSAPROBS"), + _Argument(["infile"], "Multiple sequence input file", filename=True), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_Mafft.py b/code/lib/Bio/Align/Applications/_Mafft.py new file mode 100644 index 0000000..4a0b901 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_Mafft.py @@ -0,0 +1,435 @@ +# Copyright 2009 by Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment programme MAFFT.""" + + +from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline + + +class MafftCommandline(AbstractCommandline): + """Command line wrapper for the multiple alignment program MAFFT. + + http://align.bmr.kyushu-u.ac.jp/mafft/software/ + + Notes + ----- + Last checked against version: MAFFT v6.717b (2009/12/03) + + References + ---------- + Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of + multiple ncRNA alignment by incorporating structural information into + a MAFFT-based framework (describes RNA structural alignment methods) + + Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent + developments in the MAFFT multiple sequence alignment program + (outlines version 6) + + Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an + algorithm to build an approximate tree from a large number of + unaligned sequences (describes the PartTree algorithm) + + Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT + version 5: improvement in accuracy of multiple sequence alignment + (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i + strategies) + + Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 
30:3059-3066, 2002) + + Examples + -------- + >>> from Bio.Align.Applications import MafftCommandline + >>> mafft_exe = "/opt/local/mafft" + >>> in_file = "../Doc/examples/opuntia.fasta" + >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file) + >>> print(mafft_cline) + /opt/local/mafft ../Doc/examples/opuntia.fasta + + If the mafft binary is on the path (typically the case on a Unix style + operating system) then you don't need to supply the executable location: + + >>> from Bio.Align.Applications import MafftCommandline + >>> in_file = "../Doc/examples/opuntia.fasta" + >>> mafft_cline = MafftCommandline(input=in_file) + >>> print(mafft_cline) + mafft ../Doc/examples/opuntia.fasta + + You would typically run the command line with mafft_cline() or via + the Python subprocess module, as described in the Biopython tutorial. + + Note that MAFFT will write the alignment to stdout, which you may + want to save to a file and then parse, e.g.:: + + stdout, stderr = mafft_cline() + with open("aligned.fasta", "w") as handle: + handle.write(stdout) + from Bio import AlignIO + align = AlignIO.read("aligned.fasta", "fasta") + + Alternatively, to parse the output with AlignIO directly you can + use StringIO to turn the string into a handle:: + + stdout, stderr = mafft_cline() + from io import StringIO + from Bio import AlignIO + align = AlignIO.read(StringIO(stdout), "fasta") + + """ + + def __init__(self, cmd="mafft", **kwargs): + """Initialize the class.""" + BLOSUM_MATRICES = ["30", "45", "62", "80"] + self.parameters = [ + # **** Algorithm **** + # Automatically selects an appropriate strategy from L-INS-i, FFT-NS- + # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) + _Switch(["--auto", "auto"], "Automatically select strategy. Default off."), + # Distance is calculated based on the number of shared 6mers. Default: on + _Switch( + ["--6merpair", "6merpair", "sixmerpair"], + "Distance is calculated based on the number of shared " + "6mers. Default: on", + ), + # All pairwise alignments are computed with the Needleman-Wunsch + # algorithm. More accurate but slower than --6merpair. Suitable for a + # set of globally alignable sequences. Applicable to up to ~200 + # sequences. A combination with --maxiterate 1000 is recommended (G- + # INS-i). Default: off (6mer distance is used) + _Switch( + ["--globalpair", "globalpair"], + "All pairwise alignments are computed with the " + "Needleman-Wunsch algorithm. Default: off", + ), + # All pairwise alignments are computed with the Smith-Waterman + # algorithm. More accurate but slower than --6merpair. Suitable for a + # set of locally alignable sequences. Applicable to up to ~200 + # sequences. A combination with --maxiterate 1000 is recommended (L- + # INS-i). Default: off (6mer distance is used) + _Switch( + ["--localpair", "localpair"], + "All pairwise alignments are computed with the " + "Smith-Waterman algorithm. Default: off", + ), + # All pairwise alignments are computed with a local algorithm with + # the generalized affine gap cost (Altschul 1998). More accurate but + # slower than --6merpair. Suitable when large internal gaps are + # expected. Applicable to up to ~200 sequences. A combination with -- + # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer + # distance is used) + _Switch( + ["--genafpair", "genafpair"], + "All pairwise alignments are computed with a local " + "algorithm with the generalized affine gap cost " + "(Altschul 1998). 
Default: off", + ), + # All pairwise alignments are computed with FASTA (Pearson and Lipman + # 1988). FASTA is required. Default: off (6mer distance is used) + _Switch( + ["--fastapair", "fastapair"], + "All pairwise alignments are computed with FASTA " + "(Pearson and Lipman 1988). Default: off", + ), + # Weighting factor for the consistency term calculated from pairwise + # alignments. Valid when either of --blobalpair, --localpair, -- + # genafpair, --fastapair or --blastpair is selected. Default: 2.7 + _Option( + ["--weighti", "weighti"], + "Weighting factor for the consistency term calculated " + "from pairwise alignments. Default: 2.7", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Guide tree is built number times in the progressive stage. Valid + # with 6mer distance. Default: 2 + _Option( + ["--retree", "retree"], + "Guide tree is built number times in the progressive " + "stage. Valid with 6mer distance. Default: 2", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # Number cycles of iterative refinement are performed. Default: 0 + _Option( + ["--maxiterate", "maxiterate"], + "Number cycles of iterative refinement are performed. Default: 0", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # Number of threads to use. Default: 1 + _Option( + ["--thread", "thread"], + "Number of threads to use. Default: 1", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # Use FFT approximation in group-to-group alignment. Default: on + _Switch( + ["--fft", "fft"], + "Use FFT approximation in group-to-group alignment. Default: on", + ), + # Do not use FFT approximation in group-to-group alignment. Default: + # off + _Switch( + ["--nofft", "nofft"], + "Do not use FFT approximation in group-to-group " + "alignment. Default: off", + ), + # Alignment score is not checked in the iterative refinement stage. + # Default: off (score is checked) + _Switch( + ["--noscore", "noscore"], + "Alignment score is not checked in the iterative " + "refinement stage. Default: off (score is checked)", + ), + # Use the Myers-Miller (1988) algorithm. Default: automatically + # turned on when the alignment length exceeds 10,000 (aa/nt). + _Switch( + ["--memsave", "memsave"], + "Use the Myers-Miller (1988) algorithm. Default: " + "automatically turned on when the alignment length " + "exceeds 10,000 (aa/nt).", + ), + # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with + # the 6mer distance. Recommended for a large number (> ~10,000) of + # sequences are input. Default: off + _Switch( + ["--parttree", "parttree"], + "Use a fast tree-building method with the 6mer " + "distance. Default: off", + ), + # The PartTree algorithm is used with distances based on DP. Slightly + # more accurate and slower than --parttree. Recommended for a large + # number (> ~10,000) of sequences are input. Default: off + _Switch( + ["--dpparttree", "dpparttree"], + "The PartTree algorithm is used with distances " + "based on DP. Default: off", + ), + # The PartTree algorithm is used with distances based on FASTA. + # Slightly more accurate and slower than --parttree. Recommended for + # a large number (> ~10,000) of sequences are input. FASTA is + # required. Default: off + _Switch( + ["--fastaparttree", "fastaparttree"], + "The PartTree algorithm is used with distances based " + "on FASTA. Default: off", + ), + # The number of partitions in the PartTree algorithm. 
Default: 50 + _Option( + ["--partsize", "partsize"], + "The number of partitions in the PartTree algorithm. Default: 50", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # Do not make alignment larger than number sequences. Valid only with + # the --*parttree options. Default: the number of input sequences + _Switch( + ["--groupsize", "groupsize"], + "Do not make alignment larger than number sequences. " + "Default: the number of input sequences", + ), + # Adjust direction according to the first sequence + # Mafft V6 beta function + _Switch( + ["--adjustdirection", "adjustdirection"], + "Adjust direction according to the first sequence. Default off.", + ), + # Adjust direction according to the first sequence + # for highly diverged data; very slow + # Mafft V6 beta function + _Switch( + ["--adjustdirectionaccurately", "adjustdirectionaccurately"], + "Adjust direction according to the first sequence," + "for highly diverged data; very slow" + "Default off.", + ), + # **** Parameter **** + # Gap opening penalty at group-to-group alignment. Default: 1.53 + _Option( + ["--op", "op"], + "Gap opening penalty at group-to-group alignment. Default: 1.53", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Offset value, which works like gap extension penalty, for group-to- + # group alignment. Deafult: 0.123 + _Option( + ["--ep", "ep"], + "Offset value, which works like gap extension penalty, " + "for group-to- group alignment. Default: 0.123", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Gap opening penalty at local pairwise alignment. Valid when the -- + # localpair or --genafpair option is selected. Default: -2.00 + _Option( + ["--lop", "lop"], + "Gap opening penalty at local pairwise alignment. Default: 0.123", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Offset value at local pairwise alignment. Valid when the -- + # localpair or --genafpair option is selected. Default: 0.1 + _Option( + ["--lep", "lep"], + "Offset value at local pairwise alignment. Default: 0.1", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Gap extension penalty at local pairwise alignment. Valid when the - + # -localpair or --genafpair option is selected. Default: -0.1 + _Option( + ["--lexp", "lexp"], + "Gap extension penalty at local pairwise alignment. Default: -0.1", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Gap opening penalty to skip the alignment. Valid when the -- + # genafpair option is selected. Default: -6.00 + _Option( + ["--LOP", "LOP"], + "Gap opening penalty to skip the alignment. Default: -6.00", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # Gap extension penalty to skip the alignment. Valid when the -- + # genafpair option is selected. Default: 0.00 + _Option( + ["--LEXP", "LEXP"], + "Gap extension penalty to skip the alignment. Default: 0.00", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. + # number=30, 45, 62 or 80. Default: 62 + _Option( + ["--bl", "bl"], + "BLOSUM number matrix is used. Default: 62", + checker_function=lambda x: x in BLOSUM_MATRICES, + equate=False, + ), + # JTT PAM number (Jones et al. 1992) matrix is used. number>0. + # Default: BLOSUM62 + _Option( + ["--jtt", "jtt"], + "JTT PAM number (Jones et al. 1992) matrix is used. " + "number>0. 
Default: BLOSUM62", + equate=False, + ), + # Transmembrane PAM number (Jones et al. 1994) matrix is used. + # number>0. Default: BLOSUM62 + _Option( + ["--tm", "tm"], + "Transmembrane PAM number (Jones et al. 1994) " + "matrix is used. number>0. Default: BLOSUM62", + filename=True, # to ensure spaced inputs are quoted + equate=False, + ), + # Use a user-defined AA scoring matrix. The format of matrixfile is + # the same to that of BLAST. Ignored when nucleotide sequences are + # input. Default: BLOSUM62 + _Option( + ["--aamatrix", "aamatrix"], + "Use a user-defined AA scoring matrix. Default: BLOSUM62", + filename=True, # to ensure spaced inputs are quoted + equate=False, + ), + # Incorporate the AA/nuc composition information into the scoring + # matrix. Default: off + _Switch( + ["--fmodel", "fmodel"], + "Incorporate the AA/nuc composition information into " + "the scoring matrix (True) or not (False, default)", + ), + # **** Output **** + # Name length for CLUSTAL and PHYLIP format output + _Option( + ["--namelength", "namelength"], + """Name length in CLUSTAL and PHYLIP output. + + MAFFT v6.847 (2011) added --namelength for use with + the --clustalout option for CLUSTAL output. + + MAFFT v7.024 (2013) added support for this with the + --phylipout option for PHYLIP output (default 10). + """, + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # Output format: clustal format. Default: off (fasta format) + _Switch( + ["--clustalout", "clustalout"], + "Output format: clustal (True) or fasta (False, default)", + ), + # Output format: phylip format. + # Added in beta with v6.847, fixed in v6.850 (2011) + _Switch( + ["--phylipout", "phylipout"], + "Output format: phylip (True), or fasta (False, default)", + ), + # Output order: same as input. Default: on + _Switch( + ["--inputorder", "inputorder"], + "Output order: same as input (True, default) or alignment " + "based (False)", + ), + # Output order: aligned. Default: off (inputorder) + _Switch( + ["--reorder", "reorder"], + "Output order: aligned (True) or in input order (False, default)", + ), + # Guide tree is output to the input.tree file. Default: off + _Switch( + ["--treeout", "treeout"], + "Guide tree is output to the input.tree file (True) or " + "not (False, default)", + ), + # Do not report progress. Default: off + _Switch( + ["--quiet", "quiet"], + "Do not report progress (True) or not (False, default).", + ), + # **** Input **** + # Assume the sequences are nucleotide. Deafult: auto + _Switch( + ["--nuc", "nuc"], + "Assume the sequences are nucleotide (True/False). Default: auto", + ), + # Assume the sequences are amino acid. Deafult: auto + _Switch( + ["--amino", "amino"], + "Assume the sequences are amino acid (True/False). Default: auto", + ), + # MAFFT has multiple --seed commands where the unaligned input is + # aligned to the seed alignment. There can be multiple seeds in the + # form: "mafft --seed align1 --seed align2 [etc] input" + # Effectively for n number of seed alignments. + # TODO - Can we use class _ArgumentList here? 
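+            # Note: --seed is defined as a single-valued _Option below, so
+            # this wrapper can pass at most one seed alignment per call.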
+ _Option( + ["--seed", "seed"], + "Seed alignments given in alignment_n (fasta format) " + "are aligned with sequences in input.", + filename=True, + equate=False, + ), + # The input (must be FASTA format) + _Argument(["input"], "Input file name", filename=True, is_required=True), + # mafft-profile takes a second alignment input as an argument: + # mafft-profile align1 align2 + _Argument( + ["input1"], + "Second input file name for the mafft-profile command", + filename=True, + ), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_Muscle.py b/code/lib/Bio/Align/Applications/_Muscle.py new file mode 100644 index 0000000..6a67e2a --- /dev/null +++ b/code/lib/Bio/Align/Applications/_Muscle.py @@ -0,0 +1,685 @@ +# Copyright 2009 by Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment program MUSCLE.""" + + +from Bio.Application import _Option, _Switch, AbstractCommandline + + +class MuscleCommandline(AbstractCommandline): + r"""Command line wrapper for the multiple alignment program MUSCLE. + + http://www.drive5.com/muscle/ + + Notes + ----- + Last checked against version: 3.7, briefly against 3.8 + + References + ---------- + Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high + accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. + + Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with + reduced time and space complexity. BMC Bioinformatics 5(1): 113. + + Examples + -------- + >>> from Bio.Align.Applications import MuscleCommandline + >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" + >>> in_file = r"C:\My Documents\unaligned.fasta" + >>> out_file = r"C:\My Documents\aligned.fasta" + >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) + >>> print(muscle_cline) + "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" + + You would typically run the command line with muscle_cline() or via + the Python subprocess module, as described in the Biopython tutorial. + + """ + + def __init__(self, cmd="muscle", **kwargs): + """Initialize the class.""" + CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] + DISTANCE_MEASURES_ITER1 = [ + "kmer6_6", + "kmer20_3", + "kmer20_4", + "kbit20_3", + "kmer4_6", + ] + DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [ + "pctid_kimura", + "pctid_log", + ] + OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] + TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] + + # The mucleotide arguments for the sequence type parameter in MUSCLE (-seqtype) + # were updated at somepoint in MUSCLE version 3.8. Prior to the update + # 'nucleo' was used for nucleotide. This has been updated to 'rna' and 'dna'. 'nucleo' kept for + # backwards compatibility with older MUSCLE versions. 
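+        # For example, seqtype="dna" with MUSCLE 3.8 or later, or
+        # seqtype="nucleo" with older releases.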
+ SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"] + WEIGHTING_SCHEMES = [ + "none", + "clustalw", + "henikoff", + "henikoffpb", + "gsc", + "threeway", + ] + self.parameters = [ + # Can't use "in" as the final alias as this + # is a reserved word in python: + _Option( + ["-in", "in", "input"], "Input filename", filename=True, equate=False + ), + _Option(["-out", "out"], "Output filename", filename=True, equate=False), + _Switch( + ["-diags", "diags"], "Find diagonals (faster for similar sequences)" + ), + _Switch(["-profile", "profile"], "Perform a profile alignment"), + _Option( + ["-in1", "in1"], + "First input filename for profile alignment", + filename=True, + equate=False, + ), + _Option( + ["-in2", "in2"], + "Second input filename for a profile alignment", + filename=True, + equate=False, + ), + # anchorspacing Integer 32 Minimum spacing + # between anchor cols + _Option( + ["-anchorspacing", "anchorspacing"], + "Minimum spacing between anchor columns", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # center Floating point [1] Center parameter. + # Should be negative. + _Option( + ["-center", "center"], + "Center parameter - should be negative", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # cluster1 upgma upgmb Clustering method. + _Option( + ["-cluster1", "cluster1"], + "Clustering method used in iteration 1", + checker_function=lambda x: x in CLUSTERING_ALGORITHMS, + equate=False, + ), + # cluster2 upgmb cluster1 is used + # neighborjoining in iteration 1 and + # 2, cluster2 in + # later iterations. + _Option( + ["-cluster2", "cluster2"], + "Clustering method used in iteration 2", + checker_function=lambda x: x in CLUSTERING_ALGORITHMS, + equate=False, + ), + # diaglength Integer 24 Minimum length of + # diagonal. + _Option( + ["-diaglength", "diaglength"], + "Minimum length of diagonal", + checker_function=lambda x: isinstance(x, int), + equate=True, + ), + # diagmargin Integer 5 Discard this many + # positions at ends + # of diagonal. + _Option( + ["-diagmargin", "diagmargin"], + "Discard this many positions at ends of diagonal", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # distance1 kmer6_6 Kmer6_6(amino) or Distance measure + # kmer20_3 Kmer4_6(nucleo) for iteration 1 + # kmer20_4 + # kbit20_3 + # kmer4_6 + _Option( + ["-distance1", "distance1"], + "Distance measure for iteration 1", + checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, + equate=False, + ), + # distance2 kmer6_6 pctid_kimura Distance measure + # kmer20_3 for iterations + # kmer20_4 2, 3 ... + # kbit20_3 + # pctid_kimura + # pctid_log + _Option( + ["-distance2", "distance2"], + "Distance measure for iteration 2", + checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, + equate=False, + ), + # gapextend Floating point [1] The gap extend score + _Option( + ["-gapextend", "gapextend"], + "Gap extension penalty", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # gapopen Floating point [1] The gap open score + # Must be negative. + _Option( + ["-gapopen", "gapopen"], + "Gap open score - negative number", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # hydro Integer 5 Window size for + # determining whether + # a region is + # hydrophobic. 
+ _Option( + ["-hydro", "hydro"], + "Window size for hydrophobic region", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # hydrofactor Floating point 1.2 Multiplier for gap + # open/close + # penalties in + # hydrophobic regions + _Option( + ["-hydrofactor", "hydrofactor"], + "Multiplier for gap penalties in hydrophobic regions", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # log File name None. Log file name + # (delete existing + # file). + _Option(["-log", "log"], "Log file name", filename=True, equate=False), + # loga File name None. Log file name + # (append to existing + # file). + _Option( + ["-loga", "loga"], + "Log file name (append to existing file)", + filename=True, + equate=False, + ), + # matrix File name None. File name for + # substitution matrix + # in NCBI or WU-BLAST + # format. If you + # specify your own + # matrix, you should + # also specify: + # -gapopen + # -gapextend + # -center 0.0 + _Option( + ["-matrix", "matrix"], + "path to NCBI or WU-BLAST format protein substitution " + "matrix - also set -gapopen, -gapextend and -center", + filename=True, + equate=False, + ), + # diagbreak Integer 1 Maximum distance + # between two + # diagonals that + # allows them to + # merge into one + # diagonal. + _Option( + ["-diagbreak", "diagbreak"], + "Maximum distance between two diagonals that allows " + "them to merge into one diagonal", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + _Option( + ["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8 + "Deprecated in v3.8, use -diagbreak instead.", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # maxhours Floating point None. Maximum time to + # run in hours. The + # actual time may + # exceed requested + # limit by a few + # minutes. Decimals + # are allowed, so 1.5 + # means one hour and + # 30 minutes. + _Option( + ["-maxhours", "maxhours"], + "Maximum time to run in hours", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # maxiters Integer 1, 2 ... 16 Maximum number of + # iterations. + _Option( + ["-maxiters", "maxiters"], + "Maximum number of iterations", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # maxtrees Integer 1 Maximum number of + # new trees to build + # in iteration 2. + _Option( + ["-maxtrees", "maxtrees"], + "Maximum number of trees to build in iteration 2", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # minbestcolscore Floating point [1] Minimum score a + # column must have to + # be an anchor. + _Option( + ["-minbestcolscore", "minbestcolscore"], + "Minimum score a column must have to be an anchor", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # minsmoothscore Floating point [1] Minimum smoothed + # score a column must + # have to be an + # anchor. + _Option( + ["-minsmoothscore", "minsmoothscore"], + "Minimum smoothed score a column must have to be an anchor", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # objscore sp spm Objective score + # ps used by tree + # dp dependent + # xp refinement. + # spf sp=sum-of-pairs + # spm score. (dimer + # approximation) + # spm=sp for < 100 + # seqs, otherwise spf + # dp=dynamic + # programming score. + # ps=average profile- + # sequence score. + # xp=cross profile + # score. 
+ _Option( + ["-objscore", "objscore"], + "Objective score used by tree dependent refinement", + checker_function=lambda x: x in OBJECTIVE_SCORES, + equate=False, + ), + # refinewindow Integer 200 Length of window + # for -refinew. + _Option( + ["-refinewindow", "refinewindow"], + "Length of window for -refinew", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # root1 pseudo pseudo Method used to root + _Option( + ["-root1", "root1"], + "Method used to root tree in iteration 1", + checker_function=lambda x: x in TREE_ROOT_METHODS, + equate=False, + ), + # root2 midlongestspan tree; root1 is + # minavgleafdist used in iteration 1 + # and 2, root2 in + # later iterations. + _Option( + ["-root2", "root2"], + "Method used to root tree in iteration 2", + checker_function=lambda x: x in TREE_ROOT_METHODS, + equate=False, + ), + # scorefile File name None File name where to + # write a score file. + # This contains one + # line for each column + # in the alignment. + # The line contains + # the letters in the + # column followed by + # the average BLOSUM62 + # score over pairs of + # letters in the + # column. + _Option( + ["-scorefile", "scorefile"], + "Score file name, contains one line for each column" + " in the alignment with average BLOSUM62 score", + filename=True, + equate=False, + ), + # seqtype protein auto Sequence type. + # dna (MUSCLE version > 3.8) + # rna (MUSCLE version > 3.8) + # auto + # nucleo (only valid for MUSCLE versions < 3.8) + _Option( + ["-seqtype", "seqtype"], + "Sequence type", + checker_function=lambda x: x in SEQUENCE_TYPES, + equate=False, + ), + # smoothscoreceil Floating point [1] Maximum value of + # column score for + # smoothing purposes. + _Option( + ["-smoothscoreceil", "smoothscoreceil"], + "Maximum value of column score for smoothing", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # smoothwindow Integer 7 Window used for + # anchor column + # smoothing. + _Option( + ["-smoothwindow", "smoothwindow"], + "Window used for anchor column smoothing", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + # spscore File name Compute SP + # objective score of + # multiple alignment. + _Option( + ["-spscore", "spscore"], + "Compute SP objective score of multiple alignment", + filename=True, + equate=False, + ), + # SUEFF Floating point value 0.1 Constant used in + # between 0 and 1. UPGMB clustering. + # Determines the + # relative fraction + # of average linkage + # (SUEFF) vs. nearest + # neighbor linkage + # (1 SUEFF). + _Option( + ["-sueff", "sueff"], + "Constant used in UPGMB clustering", + checker_function=lambda x: isinstance(x, float), + equate=False, + ), + # tree1 File name None Save tree + _Option( + ["-tree1", "tree1"], "Save Newick tree from iteration 1", equate=False + ), + # tree2 first or second + # iteration to given + # file in Newick + # (Phylip-compatible) + # format. + _Option( + ["-tree2", "tree2"], "Save Newick tree from iteration 2", equate=False + ), + # usetree File name None Use given tree as + # guide tree. Must by + # in Newick + # (Phyip-compatible) + # format. + _Option( + ["-usetree", "usetree"], + "Use given Newick tree as guide tree", + filename=True, + equate=False, + ), + # weight1 none clustalw Sequence weighting + _Option( + ["-weight1", "weight1"], + "Weighting scheme used in iteration 1", + checker_function=lambda x: x in WEIGHTING_SCHEMES, + equate=False, + ), + # weight2 henikoff scheme. + # henikoffpb weight1 is used in + # gsc iterations 1 and 2. 
+ # clustalw weight2 is used for + # threeway tree-dependent + # refinement. + # none=all sequences + # have equal weight. + # henikoff=Henikoff & + # Henikoff weighting + # scheme. + # henikoffpb=Modified + # Henikoff scheme as + # used in PSI-BLAST. + # clustalw=CLUSTALW + # method. + # threeway=Gotoh + # three-way method. + _Option( + ["-weight2", "weight2"], + "Weighting scheme used in iteration 2", + checker_function=lambda x: x in WEIGHTING_SCHEMES, + equate=False, + ), + # ################### FORMATS #################################### + # Multiple formats can be specified on the command line + # If -msf appears it will be used regardless of other formats + # specified. If -clw appears (and not -msf), clustalw format will + # be used regardless of other formats specified. If both -clw and + # -clwstrict are specified -clwstrict will be used regardless of + # other formats specified. If -fasta is specified and not -msf, + # -clw, or clwstrict, fasta will be used. If -fasta and -html are + # specified -fasta will be used. Only if -html is specified alone + # will html be used. I kid ye not. + # clw no Write output in CLUSTALW format + # (default is FASTA). + _Switch( + ["-clw", "clw"], + "Write output in CLUSTALW format (with a MUSCLE header)", + ), + # clwstrict no Write output in CLUSTALW format with + # the "CLUSTAL W (1.81)" header rather + # than the MUSCLE version. This is + # useful when a post-processing step is + # picky about the file header. + _Switch( + ["-clwstrict", "clwstrict"], + "Write output in CLUSTALW format with version 1.81 header", + ), + # fasta yes Write output in FASTA format. + # Alternatives include clw, + # clwstrict, msf and html. + _Switch(["-fasta", "fasta"], "Write output in FASTA format"), + # html no Write output in HTML format (default + # is FASTA). + _Switch(["-html", "html"], "Write output in HTML format"), + # msf no Write output in MSF format (default + # is FASTA). + _Switch(["-msf", "msf"], "Write output in MSF format"), + # Phylip interleaved - undocumented as of 3.7 + _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"), + # Phylip sequential - undocumented as of 3.7 + _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"), + # ################# Additional specified output files ######### + _Option( + ["-phyiout", "phyiout"], + "Write PHYLIP interleaved output to specified filename", + filename=True, + equate=False, + ), + _Option( + ["-physout", "physout"], + "Write PHYLIP sequential format to specified filename", + filename=True, + equate=False, + ), + _Option( + ["-htmlout", "htmlout"], + "Write HTML output to specified filename", + filename=True, + equate=False, + ), + _Option( + ["-clwout", "clwout"], + "Write CLUSTALW output (with MUSCLE header) to specified filename", + filename=True, + equate=False, + ), + _Option( + ["-clwstrictout", "clwstrictout"], + "Write CLUSTALW output (with version 1.81 header) to " + "specified filename", + filename=True, + equate=False, + ), + _Option( + ["-msfout", "msfout"], + "Write MSF format output to specified filename", + filename=True, + equate=False, + ), + _Option( + ["-fastaout", "fastaout"], + "Write FASTA format output to specified filename", + filename=True, + equate=False, + ), + # ############# END FORMATS ################################### + # anchors yes Use anchor optimization in tree + # dependent refinement iterations. 
+        _Switch(
+            ["-anchors", "anchors"],
+            "Use anchor optimisation in tree dependent refinement iterations",
+        ),
+        # noanchors        no     Disable anchor optimization. Default
+        #                         is anchors.
+        _Switch(
+            ["-noanchors", "noanchors"],
+            "Do not use anchor optimisation in tree dependent "
+            "refinement iterations",
+        ),
+        # brenner          no     Use Steven Brenner's method for
+        #                         computing the root alignment.
+        _Switch(
+            ["-brenner", "brenner"], "Use Steve Brenner's root alignment method"
+        ),
+        # cluster          no     Perform fast clustering of input
+        #                         sequences. Use the tree1 option to
+        #                         save the tree.
+        _Switch(
+            ["-cluster", "cluster"],
+            "Perform fast clustering of input sequences, "
+            "use -tree1 to save tree",
+        ),
+        # dimer            no     Use dimer approximation for the
+        #                         SP score (faster, less accurate).
+        _Switch(
+            ["-dimer", "dimer"],
+            "Use faster (slightly less accurate) dimer approximation "
+            "for the SP score",
+        ),
+        # group            yes    Group similar sequences together
+        #                         in the output. This is the default.
+        #                         See also stable.
+        _Switch(["-group", "group"], "Group similar sequences in output"),
+        # ############# log-expectation profile score ####################
+        # One of either -le, -sp, or -sv
+        #
+        # According to the doc, spn is default and the only option for
+        # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
+        # can be used and produce numerically different logs
+        # (what is going on?)
+        #
+        # spn fails on proteins
+        # le               maybe  Use log-expectation profile score
+        #                         (VTML240). Alternatives are to use sp
+        #                         or sv. This is the default for amino
+        #                         acid sequences.
+        _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
+        # sv               no     Use sum-of-pairs profile score
+        #                         (VTML240). Default is le.
+        _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
+        # sp               no     Use sum-of-pairs protein profile
+        #                         score (PAM200). Default is le.
+        _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
+        # spn              maybe  Use sum-of-pairs nucleotide profile
+        #                         score (BLASTZ parameters). This is
+        #                         the only option for nucleotides,
+        #                         and is therefore the default.
+        _Switch(
+            ["-spn", "spn"], "Use sum-of-pairs nucleotide profile score"
+        ),
+        # ########## END log-expectation profile score ###################
+        # quiet            no     Do not display progress messages.
+        _Switch(["-quiet", "quiet"], "Do not display progress messages"),
+        # refine           no     Input file is already aligned, skip
+        #                         first two iterations and begin tree
+        #                         dependent refinement.
+        _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
+        # refinew          no     Refine an alignment by dividing it
+        #                         into non-overlapping windows and
+        #                         re-aligning each window. Typically
+        #                         used for whole-genome nucleotide
+        #                         alignments.
+        _Switch(
+            ["-refinew", "refinew"],
+            "Only do tree dependent refinement using sliding window approach",
+        ),
+        # core             yes in muscle,  Do not catch exceptions.
+        #                  no in muscled.
+        _Switch(["-core", "core"], "Do not catch exceptions"),
+        # nocore           no in muscle,   Catch exceptions and give an
+        #                  yes in muscled. error message if possible.
+        _Switch(["-nocore", "nocore"], "Catch exceptions"),
+        # stable           no     Preserve input order of sequences
+        #                         in output file. Default is to group
+        #                         sequences by similarity (group).
+        _Switch(
+            ["-stable", "stable"],
+            "Do not group similar sequences in output (not supported in v3.8)",
+        ),
+        # termgaps4        yes    Use 4-way test for treatment of
+        #                         terminal gaps.
+        #                         (Cannot be disabled in this version).
+ # + # termgapsfull no Terminal gaps penalized with + # full penalty. [1] Not fully + # supported in this version + # + # termgapshalf yes Terminal gaps penalized with + # half penalty. [1] Not fully + # supported in this version + # + # termgapshalflonger no Terminal gaps penalized with + # half penalty if gap relative + # to longer sequence, otherwise with + # full penalty. [1] Not fully + # supported in this version + # + # verbose no Write parameter settings and + # progress messages to log file. + _Switch(["-verbose", "verbose"], "Write parameter settings and progress"), + # version no Write version string to + # stdout and exit + _Switch(["-version", "version"], "Write version string to stdout and exit"), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/_Prank.py b/code/lib/Bio/Align/Applications/_Prank.py new file mode 100644 index 0000000..4d07c56 --- /dev/null +++ b/code/lib/Bio/Align/Applications/_Prank.py @@ -0,0 +1,236 @@ +# Copyright 2009 by Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Command line wrapper for the multiple alignment program PRANK.""" + +from Bio.Application import _Option, _Switch, AbstractCommandline + + +class PrankCommandline(AbstractCommandline): + """Command line wrapper for the multiple alignment program PRANK. + + http://www.ebi.ac.uk/goldman-srv/prank/prank/ + + Notes + ----- + Last checked against version: 081202 + + References + ---------- + Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive + multiple alignment of sequences with insertions. Proceedings of + the National Academy of Sciences, 102: 10557--10562. + + Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement + prevents errors in sequence alignment and evolutionary analysis. + Science, 320: 1632. + + Examples + -------- + To align a FASTA file (unaligned.fasta) with the output in aligned + FASTA format with the output filename starting with "aligned" (you + can't pick the filename explicitly), no tree output and no XML output, + use: + + >>> from Bio.Align.Applications import PrankCommandline + >>> prank_cline = PrankCommandline(d="unaligned.fasta", + ... o="aligned", # prefix only! + ... f=8, # FASTA output + ... notree=True, noxml=True) + >>> print(prank_cline) + prank -d=unaligned.fasta -o=aligned -f=8 -noxml -notree + + You would typically run the command line with prank_cline() or via + the Python subprocess module, as described in the Biopython tutorial. + + """ + + def __init__(self, cmd="prank", **kwargs): + """Initialize the class.""" + OUTPUT_FORMAT_VALUES = list(range(1, 18)) + self.parameters = [ + # ################# input/output parameters: ################## + # -d=sequence_file + _Option(["-d", "d"], "Input filename", filename=True, is_required=True), + # -t=tree_file [default: no tree, generate approximate NJ tree] + _Option(["-t", "t"], "Input guide tree filename", filename=True), + # -tree="tree_string" [tree in newick format; in double quotes] + _Option(["-tree", "tree"], "Input guide tree as Newick string"), + # -m=model_file [default: HKY2/WAG] + _Option( + ["-m", "m"], "User-defined alignment model filename. 
Default: HKY2/WAG" + ), + # -o=output_file [default: 'output'] + _Option( + ["-o", "o"], + "Output filenames prefix. Default: 'output'\n " + "Will write: output.?.fas (depending on requested " + "format), output.?.xml and output.?.dnd", + filename=True, + ), + # -f=output_format [default: 8] + _Option( + ["-f", "f"], + "Output alignment format. Default: 8 FASTA\n" + "Option are:\n" + "1. IG/Stanford 8. Pearson/Fasta\n" + "2. GenBank/GB 11. Phylip3.2\n" + "3. NBRF 12. Phylip\n" + "4. EMBL 14. PIR/CODATA\n" + "6. DNAStrider 15. MSF\n" + "7. Fitch 17. PAUP/NEXUS", + checker_function=lambda x: x in OUTPUT_FORMAT_VALUES, + ), + _Switch( + ["-noxml", "noxml"], + "Do not output XML files (PRANK versions earlier than v.120626)", + ), + _Switch( + ["-notree", "notree"], + "Do not output dnd tree files (PRANK versions earlier than v.120626)", + ), + _Switch( + ["-showxml", "showxml"], "Output XML files (PRANK v.120626 and later)" + ), + _Switch( + ["-showtree", "showtree"], + "Output dnd tree files (PRANK v.120626 and later)", + ), + _Switch(["-shortnames", "shortnames"], "Truncate names at first space"), + _Switch(["-quiet", "quiet"], "Reduce verbosity"), + # ###################### model parameters: ###################### + # +F [force insertions to be always skipped] + # -F [equivalent] + _Switch( + ["-F", "+F", "F"], "Force insertions to be always skipped: same as +F" + ), + # -dots [show insertion gaps as dots] + _Switch(["-dots", "dots"], "Show insertion gaps as dots"), + # -gaprate=# [gap opening rate; default: dna 0.025 / prot 0.0025] + _Option( + ["-gaprate", "gaprate"], + "Gap opening rate. Default: dna 0.025 prot 0.0025", + checker_function=lambda x: isinstance(x, float), + ), + # -gapext=# [gap extension probability; default: dna 0.5 / prot 0.5] + _Option( + ["-gapext", "gapext"], + "Gap extension probability. Default: dna 0.5 / prot 0.5", + checker_function=lambda x: isinstance(x, float), + ), + # -dnafreqs=#,#,#,# [ACGT; default: empirical] + _Option( + ["-dnafreqs", "dnafreqs"], + "DNA frequencies - 'A,C,G,T'. eg '25,25,25,25' as a quote " + "surrounded string value. Default: empirical", + checker_function=lambda x: isinstance(x, bytes), + ), + # -kappa=# [ts/tv rate ratio; default:2] + _Option( + ["-kappa", "kappa"], + "Transition/transversion ratio. Default: 2", + checker_function=lambda x: isinstance(x, int), + ), + # -rho=# [pur/pyr rate ratio; default:1] + _Option( + ["-rho", "rho"], + "Purine/pyrimidine ratio. Default: 1", + checker_function=lambda x: isinstance(x, int), + ), + # -codon [for DNA: use empirical codon model] + _Switch(["-codon", "codon"], "Codon aware alignment or not"), + # -termgap [penalise terminal gaps normally] + _Switch(["-termgap", "termgap"], "Penalise terminal gaps normally"), + # ############### other parameters: ################################ + # -nopost [do not compute posterior support; default: compute] + _Switch( + ["-nopost", "nopost"], + "Do not compute posterior support. Default: compute", + ), + # -pwdist=# [expected pairwise distance for computing guidetree; + # default: dna 0.25 / prot 0.5] + _Option( + ["-pwdist", "pwdist"], + "Expected pairwise distance for computing guidetree. " + "Default: dna 0.25 / prot 0.5", + checker_function=lambda x: isinstance(x, float), + ), + _Switch( + ["-once", "once"], "Run only once. 
Default: twice if no guidetree given"
+        ),
+        _Switch(["-twice", "twice"], "Always run twice"),
+        _Switch(["-skipins", "skipins"], "Skip insertions in posterior support"),
+        _Switch(
+            ["-uselogs", "uselogs"],
+            "Slower but should work for a greater number of sequences",
+        ),
+        _Switch(["-writeanc", "writeanc"], "Output ancestral sequences"),
+        _Switch(
+            ["-printnodes", "printnodes"], "Output each node; mostly for debugging"
+        ),
+        # -matresize=# [matrix resizing multiplier]
+        # Doesn't specify type but Float and Int work
+        _Option(
+            ["-matresize", "matresize"],
+            "Matrix resizing multiplier",
+            checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)),
+        ),
+        # -matinitsize=# [matrix initial size multiplier]
+        # Doesn't specify type but Float and Int work
+        _Option(
+            ["-matinitsize", "matinitsize"],
+            "Matrix initial size multiplier",
+            checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)),
+        ),
+        _Switch(["-longseq", "longseq"], "Save space in pairwise alignments"),
+        _Switch(["-pwgenomic", "pwgenomic"], "Do pairwise alignment, no guidetree"),
+        # -pwgenomicdist=# [distance for pairwise alignment; default: 0.3]
+        _Option(
+            ["-pwgenomicdist", "pwgenomicdist"],
+            "Distance for pairwise alignment. Default: 0.3",
+            checker_function=lambda x: isinstance(x, float),
+        ),
+        # -scalebranches=# [scale branch lengths; default: dna 1 / prot 2]
+        _Option(
+            ["-scalebranches", "scalebranches"],
+            "Scale branch lengths. Default: dna 1 / prot 2",
+            checker_function=lambda x: isinstance(x, int),
+        ),
+        # -fixedbranches=# [use fixed branch lengths]
+        # Assume looking for a float
+        _Option(
+            ["-fixedbranches", "fixedbranches"],
+            "Use fixed branch lengths of input value",
+            checker_function=lambda x: isinstance(x, float),
+        ),
+        # -maxbranches=# [set maximum branch length]
+        # Assume looking for a float
+        _Option(
+            ["-maxbranches", "maxbranches"],
+            "Use maximum branch lengths of input value",
+            checker_function=lambda x: isinstance(x, float),
+        ),
+        # -realbranches [disable branch length truncation]
+        _Switch(
+            ["-realbranches", "realbranches"], "Disable branch length truncation"
+        ),
+        _Switch(["-translate", "translate"], "Translate to protein"),
+        _Switch(
+            ["-mttranslate", "mttranslate"], "Translate to protein using mt table"
+        ),
+        # ##################### other: ####################
+        _Switch(
+            ["-convert", "convert"],
+            "Convert input alignment to new format. Do not perform alignment",
+        ),
+    ]
+    AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
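The wrapper above also exposes PRANK's `-convert` mode. As a quick sketch (not part of the capsule; the filenames and format code are illustrative, with 17 being PAUP/NEXUS in the `-f` table above), an existing alignment could be reformatted without re-aligning:

```python
from Bio.Align.Applications import PrankCommandline

# Reformat an existing FASTA alignment as PAUP/NEXUS (format code 17)
# using -convert, which skips the alignment step entirely.
cline = PrankCommandline(d="aligned.fasta", o="converted", f=17, convert=True)
print(cline)  # inspect the command line before running it with cline()
```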
diff --git a/code/lib/Bio/Align/Applications/_Probcons.py b/code/lib/Bio/Align/Applications/_Probcons.py
new file mode 100644
index 0000000..e94e026
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Probcons.py
@@ -0,0 +1,137 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program PROBCONS."""
+
+from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
+
+
+class ProbconsCommandline(AbstractCommandline):
+    """Command line wrapper for the multiple alignment program PROBCONS.
+
+    http://probcons.stanford.edu/
+
+    Notes
+    -----
+    Last checked against version: 1.12
+
+    References
+    ----------
+    Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005.
+    PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment.
+    Genome Research 15: 330-340.
+
+    Examples
+    --------
+    To align a FASTA file (unaligned.fasta) with the output in ClustalW
+    format, and otherwise default settings, use:
+
+    >>> from Bio.Align.Applications import ProbconsCommandline
+    >>> probcons_cline = ProbconsCommandline(input="unaligned.fasta",
+    ...                                      clustalw=True)
+    >>> print(probcons_cline)
+    probcons -clustalw unaligned.fasta
+
+    You would typically run the command line with probcons_cline() or via
+    the Python subprocess module, as described in the Biopython tutorial.
+
+    Note that PROBCONS will write the alignment to stdout, which you may
+    want to save to a file and then parse, e.g.::
+
+        stdout, stderr = probcons_cline()
+        with open("aligned.aln", "w") as handle:
+            handle.write(stdout)
+        from Bio import AlignIO
+        align = AlignIO.read("aligned.aln", "clustal")
+
+    Alternatively, to parse the output with AlignIO directly you can
+    use StringIO to turn the string into a handle::
+
+        stdout, stderr = probcons_cline()
+        from io import StringIO
+        from Bio import AlignIO
+        align = AlignIO.read(StringIO(stdout), "clustal")
+
+    """
+
+    def __init__(self, cmd="probcons", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # Note that some options cannot be assigned via properties using the
+            # original documented option (because hyphens are not valid for names in
+            # python), e.g cmdline.pre-training = 3 will not work
+            # In these cases the shortened option name should be used
+            # cmdline.pre = 3
+            _Switch(
+                ["-clustalw", "clustalw"], "Use CLUSTALW output format instead of MFA"
+            ),
+            _Option(
+                ["-c", "c", "--consistency", "consistency"],
+                "Use 0 <= REPS <= 5 (default: 2) passes of consistency transformation",
+                checker_function=lambda x: x in range(0, 6),
+                equate=False,
+            ),
+            _Option(
+                ["-ir", "--iterative-refinement", "iterative-refinement", "ir"],
+                "Use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement",
+                checker_function=lambda x: x in range(0, 1001),
+                equate=False,
+            ),
+            _Option(
+                ["-pre", "--pre-training", "pre-training", "pre"],
+                "Use 0 <= REPS <= 20 (default: 0) rounds of pretraining",
+                checker_function=lambda x: x in range(0, 21),
+                equate=False,
+            ),
+            _Switch(["-pairs", "pairs"], "Generate all-pairs pairwise alignments"),
+            _Switch(
+                ["-viterbi", "viterbi"],
+                "Use Viterbi algorithm to generate all pairs "
+                "(automatically enables -pairs)",
+            ),
+            _Switch(
+                ["-verbose", "verbose"], "Report progress while aligning (default: off)"
+            ),
+            _Option(
+                ["-annot", "annot"],
+                "Write annotation for multiple alignment to FILENAME",
+                equate=False,
+            ),
+            _Option(
+                ["-t", "t", "--train", "train"],
+                "Compute EM transition probabilities, store in FILENAME "
+                "(default: no training)",
+                equate=False,
+            ),
+            _Switch(
+                ["-e", "e", "--emissions", "emissions"],
+                "Also reestimate emission probabilities (default: off)",
+            ),
+            _Option(
+                ["-p", "p", "--paramfile", "paramfile"],
+                "Read parameters from FILENAME",
+                equate=False,
+            ),
+            _Switch(
+                ["-a", "--alignment-order", "alignment-order", "a"],
+                "Print sequences in alignment order rather than input "
+                "order (default: off)",
+            ),
+            # Input file name
+            _Argument(
+                ["input"],
+                "Input file name. Must be multiple FASTA alignment (MFA) format",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
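Since the package docstring (see `__init__.py` below) recommends building commands and invoking them via the `subprocess` module directly, here is a minimal sketch of the equivalent direct call; the file names are hypothetical and a `probcons` binary is assumed to be on the PATH:

```python
import subprocess

# PROBCONS writes the alignment to stdout, so capture it and
# save it to a file for later parsing with Bio.AlignIO.
result = subprocess.run(
    ["probcons", "-clustalw", "unaligned.fasta"],
    capture_output=True,
    text=True,
    check=True,
)
with open("aligned.aln", "w") as handle:
    handle.write(result.stdout)
```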
diff --git a/code/lib/Bio/Align/Applications/_TCoffee.py b/code/lib/Bio/Align/Applications/_TCoffee.py
new file mode 100644
index 0000000..de337bc
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_TCoffee.py
@@ -0,0 +1,125 @@
+# Copyright 2009 by Cymon J. Cox and Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program TCOFFEE."""
+
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class TCoffeeCommandline(AbstractCommandline):
+    """Commandline object for the TCoffee alignment program.
+
+    http://www.tcoffee.org/Projects_home_page/t_coffee_home_page.html
+
+    The T-Coffee command line tool has a lot of switches and options.
+    This wrapper implements a VERY limited number of options - if you
+    would like to help improve it please get in touch.
+
+    Notes
+    -----
+    Last checked against: Version_6.92
+
+    References
+    ----------
+    T-Coffee: A novel method for multiple sequence alignments.
+    Notredame, Higgins, Heringa, JMB, 302(205-217) 2000
+
+    Examples
+    --------
+    To align a FASTA file (unaligned.fasta) with the output in ClustalW
+    format (file aligned.aln), and otherwise default settings, use:
+
+    >>> from Bio.Align.Applications import TCoffeeCommandline
+    >>> tcoffee_cline = TCoffeeCommandline(infile="unaligned.fasta",
+    ...                                    output="clustalw",
+    ...                                    outfile="aligned.aln")
+    >>> print(tcoffee_cline)
+    t_coffee -output clustalw -infile unaligned.fasta -outfile aligned.aln
+
+    You would typically run the command line with tcoffee_cline() or via
+    the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    SEQ_TYPES = ["dna", "protein", "dna_protein"]
+
+    def __init__(self, cmd="t_coffee", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            _Option(
+                ["-output", "output"],
+                """Specify the output type.
+
+                One (or more separated by a comma) of:
+                'clustalw_aln', 'clustalw', 'gcg', 'msf_aln',
+                'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq'
+                """,
+                equate=False,
+            ),
+            _Option(
+                ["-infile", "infile"],
+                "Specify the input file.",
+                filename=True,
+                is_required=True,
+                equate=False,
+            ),
+            # Indicates the name of the alignment output by t_coffee. If the
+            # default is used, the alignment is named <your sequences>.aln
+            _Option(
+                ["-outfile", "outfile"],
+                "Specify the output file. Default: <your sequences>.aln",
+                filename=True,
+                equate=False,
+            ),
+            _Switch(
+                ["-convert", "convert"], "Specify you want to perform a file conversion"
+            ),
+            _Option(
+                ["-type", "type"],
+                "Specify the type of sequence being aligned",
+                checker_function=lambda x: x in self.SEQ_TYPES,
+                equate=False,
+            ),
+            _Option(
+                ["-outorder", "outorder"],
+                "Specify the order of sequence to output. "
+                "Either 'input', 'aligned' or <filename> of "
+                "Fasta file with sequence order",
+                equate=False,
+            ),
+            _Option(
+                ["-matrix", "matrix"],
+                "Specify the filename of the substitution matrix to use. 
" + "Default: blosum62mt", + equate=False, + ), + _Option( + ["-gapopen", "gapopen"], + "Indicates the penalty applied for opening a gap (negative integer)", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + _Option( + ["-gapext", "gapext"], + "Indicates the penalty applied for extending a gap (negative integer)", + checker_function=lambda x: isinstance(x, int), + equate=False, + ), + _Switch(["-quiet", "quiet"], "Turn off log output"), + _Option( + ["-mode", "mode"], + "Specifies a special mode: genome, quickaln, dali, 3dcoffee", + equate=False, + ), + ] + AbstractCommandline.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/Applications/__init__.py b/code/lib/Bio/Align/Applications/__init__.py new file mode 100644 index 0000000..778a7dd --- /dev/null +++ b/code/lib/Bio/Align/Applications/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2009 by Peter Cock & Cymon J. Cox. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Alignment command line tool wrappers (OBSOLETE). + +We have decided to remove this module in future, and instead recommend +building your command and invoking it via the subprocess module directly. +""" + +from ._Muscle import MuscleCommandline +from ._Clustalw import ClustalwCommandline +from ._ClustalOmega import ClustalOmegaCommandline +from ._Prank import PrankCommandline +from ._Mafft import MafftCommandline +from ._Dialign import DialignCommandline +from ._Probcons import ProbconsCommandline +from ._TCoffee import TCoffeeCommandline +from ._MSAProbs import MSAProbsCommandline + +# Make this explicit, then they show up in the API docs +__all__ = ( + "MuscleCommandline", + "ClustalwCommandline", + "ClustalOmegaCommandline", + "PrankCommandline", + "MafftCommandline", + "DialignCommandline", + "ProbconsCommandline", + "TCoffeeCommandline", + "MSAProbsCommandline", +) diff --git a/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc new file mode 100644 index 0000000..047e02a Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc new file mode 100644 index 0000000..5cfed83 Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc new file mode 100644 index 0000000..48be4d1 Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc new file mode 100644 index 0000000..9a47d9c Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc new file mode 100644 index 
0000000..d7dc9b7 Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc new file mode 100644 index 0000000..1fc62ff Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc new file mode 100644 index 0000000..191a273 Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc new file mode 100644 index 0000000..a0b18ca Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc new file mode 100644 index 0000000..a2f271d Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..5912300 Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/__init__.py b/code/lib/Bio/Align/__init__.py new file mode 100644 index 0000000..ac5b1cd --- /dev/null +++ b/code/lib/Bio/Align/__init__.py @@ -0,0 +1,2326 @@ +# Copyright 2000, 2004 by Brad Chapman. +# Revisions copyright 2010-2013, 2015-2018 by Peter Cock. +# All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Code for dealing with sequence alignments. + +One of the most important things in this module is the MultipleSeqAlignment +class, used in the Bio.AlignIO module. + +""" + +import sys + +from Bio.Align import _aligners +from Bio.Align import substitution_matrices +from Bio.Seq import Seq, MutableSeq, reverse_complement, UndefinedSequenceError +from Bio.SeqRecord import SeqRecord, _RestrictedDict + +# Import errors may occur here if a compiled aligners.c file +# (_aligners.pyd or _aligners.so) is missing or if the user is +# importing from within the Biopython source tree, see PR #2007: +# https://github.com/biopython/biopython/pull/2007 + + +class MultipleSeqAlignment: + """Represents a classical multiple sequence alignment (MSA). + + By this we mean a collection of sequences (usually shown as rows) which + are all the same length (usually with gap characters for insertions or + padding). The data can then be regarded as a matrix of letters, with well + defined columns. 
+ + You would typically create an MSA by loading an alignment file with the + AlignIO module: + + >>> from Bio import AlignIO + >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") + >>> print(align) + Alignment with 7 rows and 156 columns + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 + TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 + + In some respects you can treat these objects as lists of SeqRecord objects, + each representing a row of the alignment. Iterating over an alignment gives + the SeqRecord object for each row: + + >>> len(align) + 7 + >>> for record in align: + ... print("%s %i" % (record.id, len(record))) + ... + gi|6273285|gb|AF191659.1|AF191 156 + gi|6273284|gb|AF191658.1|AF191 156 + gi|6273287|gb|AF191661.1|AF191 156 + gi|6273286|gb|AF191660.1|AF191 156 + gi|6273290|gb|AF191664.1|AF191 156 + gi|6273289|gb|AF191663.1|AF191 156 + gi|6273291|gb|AF191665.1|AF191 156 + + You can also access individual rows as SeqRecord objects via their index: + + >>> print(align[0].id) + gi|6273285|gb|AF191659.1|AF191 + >>> print(align[-1].id) + gi|6273291|gb|AF191665.1|AF191 + + And extract columns as strings: + + >>> print(align[:, 1]) + AAAAAAA + + Or, take just the first ten columns as a sub-alignment: + + >>> print(align[:, :10]) + Alignment with 7 rows and 10 columns + TATACATTAA gi|6273285|gb|AF191659.1|AF191 + TATACATTAA gi|6273284|gb|AF191658.1|AF191 + TATACATTAA gi|6273287|gb|AF191661.1|AF191 + TATACATAAA gi|6273286|gb|AF191660.1|AF191 + TATACATTAA gi|6273290|gb|AF191664.1|AF191 + TATACATTAA gi|6273289|gb|AF191663.1|AF191 + TATACATTAA gi|6273291|gb|AF191665.1|AF191 + + Combining this alignment slicing with alignment addition allows you to + remove a section of the alignment. For example, taking just the first + and last ten columns: + + >>> print(align[:, :10] + align[:, -10:]) + Alignment with 7 rows and 20 columns + TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191 + TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191 + TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191 + TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191 + TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191 + TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191 + TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191 + + Note - This object replaced the older Alignment object defined in module + Bio.Align.Generic but is not fully backwards compatible with it. + + Note - This object does NOT attempt to model the kind of alignments used + in next generation sequencing with multiple sequencing reads which are + much shorter than the alignment, and where there is usually a consensus or + reference sequence with special status. + """ + + def __init__( + self, records, alphabet=None, annotations=None, column_annotations=None + ): + """Initialize a new MultipleSeqAlignment object. + + Arguments: + - records - A list (or iterator) of SeqRecord objects, whose + sequences are all the same length. This may be an be an + empty list. + - alphabet - For backward compatibility only; its value should always + be None. 
+ - annotations - Information about the whole alignment (dictionary). + - column_annotations - Per column annotation (restricted dictionary). + This holds Python sequences (lists, strings, tuples) + whose length matches the number of columns. A typical + use would be a secondary structure consensus string. + + You would normally load a MSA from a file using Bio.AlignIO, but you + can do this from a list of SeqRecord objects too: + + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha") + >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta") + >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma") + >>> align = MultipleSeqAlignment([a, b, c], + ... annotations={"tool": "demo"}, + ... column_annotations={"stats": "CCCXCCC"}) + >>> print(align) + Alignment with 3 rows and 7 columns + AAAACGT Alpha + AAA-CGT Beta + AAAAGGT Gamma + >>> align.annotations + {'tool': 'demo'} + >>> align.column_annotations + {'stats': 'CCCXCCC'} + """ + if alphabet is not None: + raise ValueError("The alphabet argument is no longer supported") + + self._records = [] + if records: + self.extend(records) + + # Annotations about the whole alignment + if annotations is None: + annotations = {} + elif not isinstance(annotations, dict): + raise TypeError("annotations argument should be a dict") + self.annotations = annotations + + # Annotations about each column of the alignment + if column_annotations is None: + column_annotations = {} + # Handle this via the property set function which will validate it + self.column_annotations = column_annotations + + def _set_per_column_annotations(self, value): + if not isinstance(value, dict): + raise TypeError( + "The per-column-annotations should be a (restricted) dictionary." + ) + # Turn this into a restricted-dictionary (and check the entries) + if len(self): + # Use the standard method to get the length + expected_length = self.get_alignment_length() + self._per_col_annotations = _RestrictedDict(length=expected_length) + self._per_col_annotations.update(value) + else: + # Bit of a problem case... number of columns is undefined + self._per_col_annotations = None + if value: + raise ValueError( + "Can't set per-column-annotations without an alignment" + ) + + def _get_per_column_annotations(self): + if self._per_col_annotations is None: + # This happens if empty at initialisation + if len(self): + # Use the standard method to get the length + expected_length = self.get_alignment_length() + else: + # Should this raise an exception? Compare SeqRecord behaviour... + expected_length = 0 + self._per_col_annotations = _RestrictedDict(length=expected_length) + return self._per_col_annotations + + column_annotations = property( + fget=_get_per_column_annotations, + fset=_set_per_column_annotations, + doc="""Dictionary of per-letter-annotation for the sequence.""", + ) + + def _str_line(self, record, length=50): + """Return a truncated string representation of a SeqRecord (PRIVATE). + + This is a PRIVATE function used by the __str__ method. 
+ """ + if record.seq.__class__.__name__ == "CodonSeq": + if len(record.seq) <= length: + return "%s %s" % (record.seq, record.id) + else: + return "%s...%s %s" % ( + record.seq[: length - 3], + record.seq[-3:], + record.id, + ) + else: + if len(record.seq) <= length: + return "%s %s" % (record.seq, record.id) + else: + return "%s...%s %s" % ( + record.seq[: length - 6], + record.seq[-3:], + record.id, + ) + + def __str__(self): + """Return a multi-line string summary of the alignment. + + This output is intended to be readable, but large alignments are + shown truncated. A maximum of 20 rows (sequences) and 50 columns + are shown, with the record identifiers. This should fit nicely on a + single screen. e.g. + + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha") + >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta") + >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma") + >>> align = MultipleSeqAlignment([a, b, c]) + >>> print(align) + Alignment with 3 rows and 12 columns + ACTGCTAGCTAG Alpha + ACT-CTAGCTAG Beta + ACTGCTAGATAG Gamma + + See also the alignment's format method. + """ + rows = len(self._records) + lines = [ + "Alignment with %i rows and %i columns" + % (rows, self.get_alignment_length()) + ] + if rows <= 20: + lines.extend(self._str_line(rec) for rec in self._records) + else: + lines.extend(self._str_line(rec) for rec in self._records[:18]) + lines.append("...") + lines.append(self._str_line(self._records[-1])) + return "\n".join(lines) + + def __repr__(self): + """Return a representation of the object for debugging. + + The representation cannot be used with eval() to recreate the object, + which is usually possible with simple python objects. For example: + + + + The hex string is the memory address of the object, see help(id). + This provides a simple way to visually distinguish alignments of + the same size. + """ + # A doctest for __repr__ would be nice, but __class__ comes out differently + # if run via the __main__ trick. + return "<%s instance (%i records of length %i) at %x>" % ( + self.__class__, + len(self._records), + self.get_alignment_length(), + id(self), + ) + # This version is useful for doing eval(repr(alignment)), + # but it can be VERY long: + # return "%s(%r)" \ + # % (self.__class__, self._records) + + def __format__(self, format_spec): + """Return the alignment as a string in the specified file format. + + The format should be a lower case string supported as an output + format by Bio.AlignIO (such as "fasta", "clustal", "phylip", + "stockholm", etc), which is used to turn the alignment into a + string. + + e.g. 
+ + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha", description="") + >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta", description="") + >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma", description="") + >>> align = MultipleSeqAlignment([a, b, c]) + >>> print(format(align, "fasta")) + >Alpha + ACTGCTAGCTAG + >Beta + ACT-CTAGCTAG + >Gamma + ACTGCTAGATAG + + >>> print(format(align, "phylip")) + 3 12 + Alpha ACTGCTAGCT AG + Beta ACT-CTAGCT AG + Gamma ACTGCTAGAT AG + + """ + if format_spec: + from io import StringIO + from Bio import AlignIO + + handle = StringIO() + AlignIO.write([self], handle, format_spec) + return handle.getvalue() + else: + # Follow python convention and default to using __str__ + return str(self) + + def __iter__(self): + """Iterate over alignment rows as SeqRecord objects. + + e.g. + + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha") + >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta") + >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma") + >>> align = MultipleSeqAlignment([a, b, c]) + >>> for record in align: + ... print(record.id) + ... print(record.seq) + ... + Alpha + ACTGCTAGCTAG + Beta + ACT-CTAGCTAG + Gamma + ACTGCTAGATAG + """ + return iter(self._records) + + def __len__(self): + """Return the number of sequences in the alignment. + + Use len(alignment) to get the number of sequences (i.e. the number of + rows), and alignment.get_alignment_length() to get the length of the + longest sequence (i.e. the number of columns). + + This is easy to remember if you think of the alignment as being like a + list of SeqRecord objects. + """ + return len(self._records) + + def get_alignment_length(self): + """Return the maximum length of the alignment. + + All objects in the alignment should (hopefully) have the same + length. This function will go through and find this length + by finding the maximum length of sequences in the alignment. + + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha") + >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta") + >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma") + >>> align = MultipleSeqAlignment([a, b, c]) + >>> align.get_alignment_length() + 12 + + If you want to know the number of sequences in the alignment, + use len(align) instead: + + >>> len(align) + 3 + + """ + max_length = 0 + + for record in self._records: + if len(record.seq) > max_length: + max_length = len(record.seq) + + return max_length + + def extend(self, records): + """Add more SeqRecord objects to the alignment as rows. + + They must all have the same length as the original alignment. 
For + example, + + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> from Bio.Align import MultipleSeqAlignment + >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha") + >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta") + >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma") + >>> d = SeqRecord(Seq("AAAACGT"), id="Delta") + >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon") + + First we create a small alignment (three rows): + + >>> align = MultipleSeqAlignment([a, b, c]) + >>> print(align) + Alignment with 3 rows and 7 columns + AAAACGT Alpha + AAA-CGT Beta + AAAAGGT Gamma + + Now we can extend this alignment with another two rows: + + >>> align.extend([d, e]) + >>> print(align) + Alignment with 5 rows and 7 columns + AAAACGT Alpha + AAA-CGT Beta + AAAAGGT Gamma + AAAACGT Delta + AAA-GGT Epsilon + + Because the alignment object allows iteration over the rows as + SeqRecords, you can use the extend method with a second alignment + (provided its sequences have the same length as the original alignment). + """ + if len(self): + # Use the standard method to get the length + expected_length = self.get_alignment_length() + else: + # Take the first record's length + records = iter(records) # records arg could be list or iterator + try: + rec = next(records) + except StopIteration: + # Special case, no records + return + expected_length = len(rec) + self._append(rec, expected_length) + # Can now setup the per-column-annotations as well, set to None + # while missing the length: + self.column_annotations = {} + # Now continue to the rest of the records as usual + + for rec in records: + self._append(rec, expected_length) + + def append(self, record): + """Add one more SeqRecord object to the alignment as a new row. + + This must have the same length as the original alignment (unless this is + the first record). 
+ + >>> from Bio import AlignIO + >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") + >>> print(align) + Alignment with 7 rows and 156 columns + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 + TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 + >>> len(align) + 7 + + We'll now construct a dummy record to append as an example: + + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> dummy = SeqRecord(Seq("N"*156), id="dummy") + + Now append this to the alignment, + + >>> align.append(dummy) + >>> print(align) + Alignment with 8 rows and 156 columns + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 + TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 + TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 + TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 + NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy + >>> len(align) + 8 + + """ + if self._records: + self._append(record, self.get_alignment_length()) + else: + self._append(record) + + def _append(self, record, expected_length=None): + """Validate and append a record (PRIVATE).""" + if not isinstance(record, SeqRecord): + raise TypeError("New sequence is not a SeqRecord object") + + # Currently the get_alignment_length() call is expensive, so we need + # to avoid calling it repeatedly for __init__ and extend, hence this + # private _append method + if expected_length is not None and len(record) != expected_length: + # TODO - Use the following more helpful error, but update unit tests + # raise ValueError("New sequence is not of length %i" + # % self.get_alignment_length()) + raise ValueError("Sequences must all be the same length") + + self._records.append(record) + + def __add__(self, other): + """Combine two alignments with the same number of rows by adding them. + + If you have two multiple sequence alignments (MSAs), there are two ways to think + about adding them - by row or by column. Using the extend method adds by row. + Using the addition operator adds by column. For example, + + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> from Bio.Align import MultipleSeqAlignment + >>> a1 = SeqRecord(Seq("AAAAC"), id="Alpha") + >>> b1 = SeqRecord(Seq("AAA-C"), id="Beta") + >>> c1 = SeqRecord(Seq("AAAAG"), id="Gamma") + >>> a2 = SeqRecord(Seq("GT"), id="Alpha") + >>> b2 = SeqRecord(Seq("GT"), id="Beta") + >>> c2 = SeqRecord(Seq("GT"), id="Gamma") + >>> left = MultipleSeqAlignment([a1, b1, c1], + ... annotations={"tool": "demo", "name": "start"}, + ... column_annotations={"stats": "CCCXC"}) + >>> right = MultipleSeqAlignment([a2, b2, c2], + ... annotations={"tool": "demo", "name": "end"}, + ... 
column_annotations={"stats": "CC"})
+
+        Now, let's look at these two alignments:
+
+        >>> print(left)
+        Alignment with 3 rows and 5 columns
+        AAAAC Alpha
+        AAA-C Beta
+        AAAAG Gamma
+        >>> print(right)
+        Alignment with 3 rows and 2 columns
+        GT Alpha
+        GT Beta
+        GT Gamma
+
+        And add them:
+
+        >>> combined = left + right
+        >>> print(combined)
+        Alignment with 3 rows and 7 columns
+        AAAACGT Alpha
+        AAA-CGT Beta
+        AAAAGGT Gamma
+
+        For this to work, both alignments must have the same number of records (here
+        they both have 3 rows):
+
+        >>> len(left)
+        3
+        >>> len(right)
+        3
+        >>> len(combined)
+        3
+
+        The individual rows are SeqRecord objects, and these can be added together. Refer
+        to the SeqRecord documentation for details of how the annotation is handled. This
+        example is a special case in that both original alignments shared the same names,
+        meaning when the rows are added they also get the same name.
+
+        Any common annotations are preserved, but differing annotation is lost. This is
+        the same behaviour used in the SeqRecord annotations and is designed to prevent
+        accidental propagation of inappropriate values:
+
+        >>> combined.annotations
+        {'tool': 'demo'}
+
+        Similarly any common per-column-annotations are combined:
+
+        >>> combined.column_annotations
+        {'stats': 'CCCXCCC'}
+
+        """
+        if not isinstance(other, MultipleSeqAlignment):
+            raise NotImplementedError
+        if len(self) != len(other):
+            raise ValueError(
+                "When adding two alignments they must have the same length"
+                " (i.e. same number of rows)"
+            )
+        merged = (left + right for left, right in zip(self, other))
+        # Take any common annotation:
+        annotations = {}
+        for k, v in self.annotations.items():
+            if k in other.annotations and other.annotations[k] == v:
+                annotations[k] = v
+        column_annotations = {}
+        for k, v in self.column_annotations.items():
+            if k in other.column_annotations:
+                column_annotations[k] = v + other.column_annotations[k]
+        return MultipleSeqAlignment(
+            merged, annotations=annotations, column_annotations=column_annotations
+        )
+
+    def __getitem__(self, index):
+        """Access part of the alignment.
+
+        Depending on the indices, you can get a SeqRecord object
+        (representing a single row), a Seq object (for a single column),
+        a string (for a single character) or another alignment
+        (representing some part or all of the alignment).
+
+        align[r,c] gives a single character as a string
+        align[r] gives a row as a SeqRecord
+        align[r,:] gives a row as a SeqRecord
+        align[:,c] gives a column as a Seq
+
+        align[:] and align[:,:] give a copy of the alignment
+
+        Anything else gives a sub alignment, e.g.
+        align[0:2] or align[0:2,:] uses only row 0 and 1
+        align[:,1:3] uses only columns 1 and 2
+        align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2
+
+        We'll use the following example alignment here for illustration:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Align import MultipleSeqAlignment
+        >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
+        >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
+        >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
+        >>> d = SeqRecord(Seq("AAAACGT"), id="Delta")
+        >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon")
+        >>> align = MultipleSeqAlignment([a, b, c, d, e])
+
+        You can access a row of the alignment as a SeqRecord using an integer
+        index (think of the alignment as a list of SeqRecord objects here):
+
+        >>> first_record = align[0]
+        >>> print("%s %s" % (first_record.id, first_record.seq))
+        Alpha AAAACGT
+        >>> last_record = align[-1]
+        >>> print("%s %s" % (last_record.id, last_record.seq))
+        Epsilon AAA-GGT
+
+        You can also use python's slice notation to create a sub-alignment
+        containing only some of the SeqRecord objects:
+
+        >>> sub_alignment = align[2:5]
+        >>> print(sub_alignment)
+        Alignment with 3 rows and 7 columns
+        AAAAGGT Gamma
+        AAAACGT Delta
+        AAA-GGT Epsilon
+
+        This includes support for a step, i.e. align[start:end:step], which
+        can be used to select every second sequence:
+
+        >>> sub_alignment = align[::2]
+        >>> print(sub_alignment)
+        Alignment with 3 rows and 7 columns
+        AAAACGT Alpha
+        AAAAGGT Gamma
+        AAA-GGT Epsilon
+
+        Or to get a copy of the alignment with the rows in reverse order:
+
+        >>> rev_alignment = align[::-1]
+        >>> print(rev_alignment)
+        Alignment with 5 rows and 7 columns
+        AAA-GGT Epsilon
+        AAAACGT Delta
+        AAAAGGT Gamma
+        AAA-CGT Beta
+        AAAACGT Alpha
+
+        You can also use two indices to specify both rows and columns. Using simple
+        integers gives you the entry as a single character string. e.g.
+
+        >>> align[3, 4]
+        'C'
+
+        This is equivalent to:
+
+        >>> align[3][4]
+        'C'
+
+        or:
+
+        >>> align[3].seq[4]
+        'C'
+
+        To get a single column (as a string) use this syntax:
+
+        >>> align[:, 4]
+        'CCGCG'
+
+        Or, to get part of a column,
+
+        >>> align[1:3, 4]
+        'CG'
+
+        However, in general you get a sub-alignment,
+
+        >>> print(align[1:5, 3:6])
+        Alignment with 4 rows and 3 columns
+        -CG Beta
+        AGG Gamma
+        ACG Delta
+        -GG Epsilon
+
+        This should all seem familiar to anyone who has used the NumPy
+        array or matrix objects.
+        """
+        if isinstance(index, int):
+            # e.g. result = align[x]
+            # Return a SeqRecord
+            return self._records[index]
+        elif isinstance(index, slice):
+            # e.g. sub_align = align[i:j:k]
+            new = MultipleSeqAlignment(self._records[index])
+            if self.column_annotations and len(new) == len(self):
+                # All rows kept (although could have been reversed)
+                # Preserve the column annotations too,
+                for k, v in self.column_annotations.items():
+                    new.column_annotations[k] = v
+            return new
+        elif len(index) != 2:
+            raise TypeError("Invalid index type.")
+
+        # Handle double indexing
+        row_index, col_index = index
+        if isinstance(row_index, int):
+            # e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord
+            return self._records[row_index][col_index]
+        elif isinstance(col_index, int):
+            # e.g. col_or_part_col = align[1:5, 6], gives a string
+            return "".join(rec[col_index] for rec in self._records[row_index])
+        else:
+            # e.g. sub_align = align[1:4, 5:7], gives another alignment
+            new = MultipleSeqAlignment(
+                rec[col_index] for rec in self._records[row_index]
+            )
+            if self.column_annotations and len(new) == len(self):
+                # All rows kept (although could have been reversed)
+                # Preserve the column annotations too,
+                for k, v in self.column_annotations.items():
+                    new.column_annotations[k] = v[col_index]
+            return new
+
+    def sort(self, key=None, reverse=False):
+        """Sort the rows (SeqRecord objects) of the alignment in place.
+
+        This sorts the rows alphabetically using the SeqRecord object id by
+        default. The sorting can be controlled by supplying a key function
+        which must map each SeqRecord to a sort value.
+
+        This is useful if you want to add two alignments which use the same
+        record identifiers, but in a different order. For example,
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Align import MultipleSeqAlignment
+        >>> align1 = MultipleSeqAlignment([
+        ...     SeqRecord(Seq("ACGT"), id="Human"),
+        ...     SeqRecord(Seq("ACGG"), id="Mouse"),
+        ...     SeqRecord(Seq("ACGC"), id="Chicken"),
+        ... ])
+        >>> align2 = MultipleSeqAlignment([
+        ...     SeqRecord(Seq("CGGT"), id="Mouse"),
+        ...     SeqRecord(Seq("CGTT"), id="Human"),
+        ...     SeqRecord(Seq("CGCT"), id="Chicken"),
+        ... ])
+
+        If you simply try to add these without sorting, you get this:
+
+        >>> print(align1 + align2)
+        Alignment with 3 rows and 8 columns
+        ACGTCGGT <unknown id>
+        ACGGCGTT <unknown id>
+        ACGCCGCT Chicken
+
+        Consult the SeqRecord documentation which explains why you get a
+        default value when annotation like the identifier doesn't match up.
+        However, if we sort the alignments first, then add them we get the
+        desired result:
+
+        >>> align1.sort()
+        >>> align2.sort()
+        >>> print(align1 + align2)
+        Alignment with 3 rows and 8 columns
+        ACGCCGCT Chicken
+        ACGTCGTT Human
+        ACGGCGGT Mouse
+
+        As an example using a different sort order, you could sort on the
+        GC content of each sequence.
+
+        >>> from Bio.SeqUtils import GC
+        >>> print(align1)
+        Alignment with 3 rows and 4 columns
+        ACGC Chicken
+        ACGT Human
+        ACGG Mouse
+        >>> align1.sort(key = lambda record: GC(record.seq))
+        >>> print(align1)
+        Alignment with 3 rows and 4 columns
+        ACGT Human
+        ACGC Chicken
+        ACGG Mouse
+
+        There is also a reverse argument, so if you wanted to sort by ID
+        but backwards:
+
+        >>> align1.sort(reverse=True)
+        >>> print(align1)
+        Alignment with 3 rows and 4 columns
+        ACGG Mouse
+        ACGT Human
+        ACGC Chicken
+
+        """
+        if key is None:
+            self._records.sort(key=lambda r: r.id, reverse=reverse)
+        else:
+            self._records.sort(key=key, reverse=reverse)
+
+    @property
+    def substitutions(self):
+        """Return an Array with the number of substitutions of letters in the alignment.
+
+        As an example, consider a multiple sequence alignment of four DNA sequences:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Align import MultipleSeqAlignment
+        >>> seq1 = SeqRecord(Seq("ACGT"), id="seq1")
+        >>> seq2 = SeqRecord(Seq("A--A"), id="seq2")
+        >>> seq3 = SeqRecord(Seq("ACGT"), id="seq3")
+        >>> seq4 = SeqRecord(Seq("TTTC"), id="seq4")
+        >>> alignment = MultipleSeqAlignment([seq1, seq2, seq3, seq4])
+        >>> print(alignment)
+        Alignment with 4 rows and 4 columns
+        ACGT seq1
+        A--A seq2
+        ACGT seq3
+        TTTC seq4
+
+        >>> m = alignment.substitutions
+        >>> print(m)
+            A   C   G   T
+        A 3.0 0.5 0.0 2.5
+        C 0.5 1.0 0.0 2.0
+        G 0.0 0.0 1.0 1.0
+        T 2.5 2.0 1.0 1.0
+        <BLANKLINE>
+
+        Note that the matrix is symmetric, with counts divided equally on both
+        sides of the diagonal. For example, the total number of substitutions
+        between A and T in the alignment is 2.5 + 2.5 = 5.
+
+        Any weights associated with the sequences are taken into account when
+        calculating the substitution matrix. For example, given the following
+        multiple sequence alignment::
+
+            GTATC 0.5
+            AT--C 0.8
+            CTGTC 1.0
+
+        For the first column we have::
+
+            ('A', 'G') : 0.5 * 0.8 = 0.4
+            ('C', 'G') : 0.5 * 1.0 = 0.5
+            ('A', 'C') : 0.8 * 1.0 = 0.8
+
+        """
+        letters = set.union(*[set(record.seq) for record in self])
+        try:
+            letters.remove("-")
+        except KeyError:
+            pass
+        letters = "".join(sorted(letters))
+        m = substitution_matrices.Array(letters, dims=2)
+        for rec_num1, alignment1 in enumerate(self):
+            seq1 = alignment1.seq
+            weight1 = alignment1.annotations.get("weight", 1.0)
+            for rec_num2, alignment2 in enumerate(self):
+                if rec_num1 == rec_num2:
+                    break
+                seq2 = alignment2.seq
+                weight2 = alignment2.annotations.get("weight", 1.0)
+                for residue1, residue2 in zip(seq1, seq2):
+                    if residue1 == "-":
+                        continue
+                    if residue2 == "-":
+                        continue
+                    m[(residue1, residue2)] += weight1 * weight2
+
+        m += m.transpose()
+        m /= 2.0
+
+        return m
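The weighted counting implemented above can be exercised directly. A minimal sketch mirroring the docstring's weighted example (the record ids are illustrative):

```python
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

# substitutions reads each record's weight from
# record.annotations["weight"], defaulting to 1.0.
records = [
    SeqRecord(Seq("GTATC"), id="s1", annotations={"weight": 0.5}),
    SeqRecord(Seq("AT--C"), id="s2", annotations={"weight": 0.8}),
    SeqRecord(Seq("CTGTC"), id="s3", annotations={"weight": 1.0}),
]
alignment = MultipleSeqAlignment(records)
print(alignment.substitutions)  # symmetric matrix of weighted pair counts
```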
+
+
+class PairwiseAlignment:
+    """Represents a pairwise sequence alignment.
+
+    Internally, the pairwise alignment is stored as the path through
+    the traceback matrix, i.e. a tuple of pairs of indices corresponding
+    to the vertices of the path in the traceback matrix.
+    """
+
+    def __init__(self, target, query, path, score):
+        """Initialize a new PairwiseAlignment object.
+
+        Arguments:
+         - target - The first sequence, as a plain string, without gaps.
+         - query - The second sequence, as a plain string, without gaps.
+         - path - The path through the traceback matrix, defining an
+           alignment.
+         - score - The alignment score.
+
+        You would normally obtain a PairwiseAlignment object by iterating
+        over a PairwiseAlignments object.
+        """
+        self.target = target
+        self.query = query
+        self.score = score
+        self.path = path
+
+    def __eq__(self, other):
+        return self.path == other.path
+
+    def __ne__(self, other):
+        return self.path != other.path
+
+    def __lt__(self, other):
+        return self.path < other.path
+
+    def __le__(self, other):
+        return self.path <= other.path
+
+    def __gt__(self, other):
+        return self.path > other.path
+
+    def __ge__(self, other):
+        return self.path >= other.path
+
+    def __getitem__(self, key):
+        """Return self[key].
+
+        Currently, this is implemented only for indices of the form
+
+        self[:, :]
+
+        which returns a copy of the PairwiseAlignment object, and
+
+        self[:, i:]
+        self[:, :j]
+        self[:, i:j]
+
+        which returns a new PairwiseAlignment object spanning the indicated
+        columns.
+ + >>> from Bio.Align import PairwiseAligner + >>> aligner = PairwiseAligner() + >>> alignments = aligner.align("ACCGGTTT", "ACGGGTT") + >>> alignment = alignments[0] + >>> print(alignment) + ACCGG-TTT + ||-||-||- + AC-GGGTT- + + >>> alignment[:, 1:] # doctest:+ELLIPSIS + + >>> print(alignment[:, 1:]) + ACCGG-TTT + |-||-||- + AC-GGGTT- + + >>> print(alignment[:, 2:]) + ACCGG-TTT + -||-||- + AC-GGGTT- + + >>> print(alignment[:, 3:]) + ACCGG-TTT + ||-||- + ACGGGTT- + + >>> print(alignment[:, 3:-1]) + ACCGG-TTT + ||-|| + ACGGGTT + + """ + if isinstance(key, slice): + if key.indices(len(self)) == (0, 2, 1): + target = self.target + query = self.query + path = self.path + score = self.score + return PairwiseAlignment(target, query, path, score) + raise NotImplementedError + if isinstance(key, int): + raise NotImplementedError + if isinstance(key, tuple): + try: + row, col = key + except ValueError: + raise ValueError("only tuples of length 2 can be alignment indices") + if isinstance(row, int): + raise NotImplementedError + if isinstance(row, slice): + if row.indices(len(self)) != (0, 2, 1): + raise NotImplementedError + if isinstance(col, int): + raise NotImplementedError + if isinstance(col, slice): + n, m = self.shape + start_index, stop_index, step = col.indices(m) + if step != 1: + raise NotImplementedError + path = [] + index = 0 + path_iterator = iter(self.path) + starts = next(path_iterator) + for ends in path_iterator: + index += max(e - s for s, e in zip(starts, ends)) + if start_index < index: + offset = index - start_index + point = tuple( + e - offset if s < e else s for s, e in zip(starts, ends) + ) + path.append(point) + break + starts = ends + while True: + if stop_index <= index: + offset = index - stop_index + point = tuple( + e - offset if s < e else s for s, e in zip(starts, ends) + ) + path.append(point) + break + path.append(ends) + starts = ends + ends = next(path_iterator) + index += max(e - s for s, e in zip(starts, ends)) + path = tuple(path) + target = self.target + query = self.query + if path == self.path: + score = self.score + else: + score = None + return PairwiseAlignment(target, query, path, score) + raise TypeError("second index must be an integer or slice") + raise TypeError("first index must be an integer or slice") + raise TypeError("alignment indices must be integers, slices, or tuples") + + def _convert_sequence_string(self, sequence): + if isinstance(sequence, (bytes, bytearray)): + return sequence.decode() + if isinstance(sequence, str): + return sequence + if isinstance(sequence, Seq): + return str(sequence) + try: # check if target is a SeqRecord + sequence = sequence.seq + except AttributeError: + pass + else: + return str(sequence) + try: + view = memoryview(sequence) + except TypeError: + pass + else: + if view.format == "c": + return str(sequence) + return None + + def __format__(self, format_spec): + return self.format(format_spec) + + def format(self, fmt="", **kwargs): + """Return the alignment as a string in the specified file format. + + Arguments: + - fmt - File format. Acceptable values are + "" : create a human-readable representation of the + alignment (default); + "BED": create a line representing the alignment in + the Browser Extensible Data (BED) file format; + "PSL": create a line representing the alignment in + the Pattern Space Layout (PSL) file format as + generated by BLAT; + "SAM": create a line representing the alignment in + the Sequence Alignment/Map (SAM) format. + - mask - PSL format only. 
Specify if repeat regions in the target + sequence are masked and should be reported in the + `repMatches` field of the PSL file instead of in the + `matches` field. Acceptable values are + None : no masking (default); + "lower": masking by lower-case characters; + "upper": masking by upper-case characters. + - wildcard - PSL format only. Report alignments to the wildcard + character in the target or query sequence in the + `nCount` field of the PSL file instead of in the + `matches`, `misMatches`, or `repMatches` fields. + Default value is 'N'. + """ + if fmt == "": + return self._format_pretty(**kwargs) + elif fmt == "psl": + return self._format_psl(**kwargs) + elif fmt == "bed": + return self._format_bed(**kwargs) + elif fmt == "sam": + return self._format_sam(**kwargs) + else: + raise ValueError("Unknown format %s" % fmt) + + def _format_pretty(self): + seq1 = self._convert_sequence_string(self.target) + if seq1 is None: + return self._format_generalized() + seq2 = self._convert_sequence_string(self.query) + if seq2 is None: + return self._format_generalized() + n1 = len(seq1) + n2 = len(seq2) + aligned_seq1 = "" + aligned_seq2 = "" + pattern = "" + path = self.path + if path[0][1] > path[-1][1]: # mapped to reverse strand + path = tuple((c1, n2 - c2) for (c1, c2) in path) + seq2 = reverse_complement(seq2) + end1, end2 = path[0] + if end1 > 0 or end2 > 0: + end = max(end1, end2) + aligned_seq1 += " " * (end - end1) + seq1[:end1] + aligned_seq2 += " " * (end - end2) + seq2[:end2] + pattern += " " * end + start1 = end1 + start2 = end2 + for end1, end2 in path[1:]: + if end1 == start1: + gap = end2 - start2 + aligned_seq1 += "-" * gap + aligned_seq2 += seq2[start2:end2] + pattern += "-" * gap + elif end2 == start2: + gap = end1 - start1 + aligned_seq1 += seq1[start1:end1] + aligned_seq2 += "-" * gap + pattern += "-" * gap + else: + s1 = seq1[start1:end1] + s2 = seq2[start2:end2] + aligned_seq1 += s1 + aligned_seq2 += s2 + for c1, c2 in zip(s1, s2): + if c1 == c2: + pattern += "|" + else: + pattern += "." + start1 = end1 + start2 = end2 + aligned_seq1 += seq1[end1:] + aligned_seq2 += seq2[end2:] + return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2) + + def _format_generalized(self): + seq1 = self.target + seq2 = self.query + aligned_seq1 = [] + aligned_seq2 = [] + pattern = [] + path = self.path + end1, end2 = path[0] + if end1 > 0 or end2 > 0: + if end1 <= end2: + for c2 in seq2[: end2 - end1]: + s2 = str(c2) + s1 = " " * len(s2) + aligned_seq1.append(s1) + aligned_seq2.append(s2) + pattern.append(s1) + else: # end1 > end2 + for c1 in seq1[: end1 - end2]: + s1 = str(c1) + s2 = " " * len(s1) + aligned_seq1.append(s1) + aligned_seq2.append(s2) + pattern.append(s2) + start1 = end1 + start2 = end2 + for end1, end2 in path[1:]: + if end1 == start1: + for c2 in seq2[start2:end2]: + s2 = str(c2) + s1 = "-" * len(s2) + aligned_seq1.append(s1) + aligned_seq2.append(s2) + pattern.append(s1) + start2 = end2 + elif end2 == start2: + for c1 in seq1[start1:end1]: + s1 = str(c1) + s2 = "-" * len(s1) + aligned_seq1.append(s1) + aligned_seq2.append(s2) + pattern.append(s2) + start1 = end1 + else: + for c1, c2 in zip(seq1[start1:end1], seq2[start2:end2]): + s1 = str(c1) + s2 = str(c2) + m1 = len(s1) + m2 = len(s2) + if c1 == c2: + p = "|" + else: + p = "." 
+ if m1 < m2: + space = (m2 - m1) * " " + s1 += space + pattern.append(p * m1 + space) + elif m1 > m2: + space = (m1 - m2) * " " + s2 += space + pattern.append(p * m2 + space) + else: + pattern.append(p * m1) + aligned_seq1.append(s1) + aligned_seq2.append(s2) + start1 = end1 + start2 = end2 + aligned_seq1 = " ".join(aligned_seq1) + aligned_seq2 = " ".join(aligned_seq2) + pattern = " ".join(pattern) + return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2) + + def _format_bed(self): + query = self.query + target = self.target + # variable names follow those in the BED file format specification + try: + chrom = target.id + except AttributeError: + chrom = "target" + try: + name = query.id + except AttributeError: + name = "query" + path = self.path + if path[0][1] < path[-1][1]: # mapped to forward strand + strand = "+" + else: # mapped to reverse strand + strand = "-" + n2 = len(query) + path = tuple((c1, n2 - c2) for (c1, c2) in path) + score = self.score + blockSizes = [] + tStarts = [] + tStart, qStart = path[0] + for tEnd, qEnd in path[1:]: + tCount = tEnd - tStart + qCount = qEnd - qStart + if tCount == 0: + qStart = qEnd + elif qCount == 0: + tStart = tEnd + else: + assert tCount == qCount + tStarts.append(tStart) + blockSizes.append(tCount) + tStart = tEnd + qStart = qEnd + chromStart = tStarts[0] + chromEnd = tStarts[-1] + blockSizes[-1] + blockStarts = [tStart - chromStart for tStart in tStarts] + blockCount = len(blockSizes) + blockSizes = ",".join(map(str, blockSizes)) + "," + blockStarts = ",".join(map(str, blockStarts)) + "," + thickStart = chromStart + thickEnd = chromEnd + itemRgb = "0" + words = [ + chrom, + str(chromStart), + str(chromEnd), + name, + str(score), + strand, + str(thickStart), + str(thickEnd), + itemRgb, + str(blockCount), + blockSizes, + blockStarts, + ] + line = "\t".join(words) + "\n" + return line + + def _format_psl(self, mask=False, wildcard="N"): + path = self.path + if not path: # alignment consists of gaps only + return "" + query = self.query + target = self.target + try: + qName = query.id + except AttributeError: + qName = "query" + try: + query = query.seq + except AttributeError: + pass + try: + tName = target.id + except AttributeError: + tName = "target" + try: + target = target.seq + except AttributeError: + pass + n1 = len(target) + n2 = len(query) + try: + seq1 = bytes(target) + except TypeError: # string + seq1 = bytes(target, "ASCII") + except UndefinedSequenceError: # sequence contents is unknown + seq1 = None + if path[0][1] < path[-1][1]: # mapped to forward strand + strand = "+" + seq2 = query + else: # mapped to reverse strand + strand = "-" + seq2 = reverse_complement(query) + path = tuple((c1, n2 - c2) for (c1, c2) in path) + try: + seq2 = bytes(seq2) + except TypeError: # string + seq2 = bytes(seq2, "ASCII") + except UndefinedSequenceError: # sequence contents is unknown + seq2 = None + if wildcard is not None: + if mask == "upper": + wildcard = ord(wildcard.lower()) + else: + wildcard = ord(wildcard.upper()) + # variable names follow those in the PSL file format specification + matches = 0 + misMatches = 0 + repMatches = 0 + nCount = 0 + qNumInsert = 0 + qBaseInsert = 0 + tNumInsert = 0 + tBaseInsert = 0 + qSize = n2 + tSize = n1 + blockSizes = [] + qStarts = [] + tStarts = [] + tStart, qStart = path[0] + for tEnd, qEnd in path[1:]: + tCount = tEnd - tStart + qCount = qEnd - qStart + if tCount == 0: + if qStart > 0 and qEnd < qSize: + qNumInsert += 1 + qBaseInsert += qCount + qStart = qEnd + elif qCount == 0: + if tStart > 
0 and tEnd < tSize: + tNumInsert += 1 + tBaseInsert += tCount + tStart = tEnd + else: + assert tCount == qCount + tStarts.append(tStart) + qStarts.append(qStart) + blockSizes.append(tCount) + if seq1 is None or seq2 is None: + # contents of at least one sequence is unknown; + # count all alignments as matches: + matches += tCount + else: + s1 = seq1[tStart:tEnd] + s2 = seq2[qStart:qEnd] + if mask == "lower": + for u1, u2, c1 in zip(s1.upper(), s2.upper(), s1): + if u1 == wildcard or u2 == wildcard: + nCount += 1 + elif u1 == u2: + if u1 == c1: + matches += 1 + else: + repMatches += 1 + else: + misMatches += 1 + elif mask == "upper": + for u1, u2, c1 in zip(s1.lower(), s2.lower(), s1): + if u1 == wildcard or u2 == wildcard: + nCount += 1 + elif u1 == u2: + if u1 == c1: + matches += 1 + else: + repMatches += 1 + else: + misMatches += 1 + else: + for u1, u2 in zip(s1.upper(), s2.upper()): + if u1 == wildcard or u2 == wildcard: + nCount += 1 + elif u1 == u2: + matches += 1 + else: + misMatches += 1 + tStart = tEnd + qStart = qEnd + tStart = tStarts[0] # start of alignment in target + qStart = qStarts[0] # start of alignment in query + tEnd = tStarts[-1] + blockSizes[-1] # end of alignment in target + qEnd = qStarts[-1] + blockSizes[-1] # end of alignment in query + if strand == "-": + qStart, qEnd = qSize - qEnd, qSize - qStart + blockCount = len(blockSizes) + blockSizes = ",".join(map(str, blockSizes)) + "," + qStarts = ",".join(map(str, qStarts)) + "," + tStarts = ",".join(map(str, tStarts)) + "," + words = [ + str(matches), + str(misMatches), + str(repMatches), + str(nCount), + str(qNumInsert), + str(qBaseInsert), + str(tNumInsert), + str(tBaseInsert), + strand, + qName, + str(qSize), + str(qStart), + str(qEnd), + tName, + str(tSize), + str(tStart), + str(tEnd), + str(blockCount), + blockSizes, + qStarts, + tStarts, + ] + line = "\t".join(words) + "\n" + return line + + def _format_sam(self): + query = self.query + target = self.target + try: + qName = query.id + except AttributeError: + qName = "query" + else: + query = query.seq + try: + rName = target.id + except AttributeError: + rName = "target" + else: + target = target.seq + n1 = len(target) + n2 = len(query) + pos = None + qSize = n2 + tSize = n1 + cigar = [] + path = self.path + if path[0][1] < path[-1][1]: # mapped to forward strand + flag = 0 + seq = query + else: # mapped to reverse strand + flag = 16 + seq = reverse_complement(query) + path = tuple((c1, n2 - c2) for (c1, c2) in path) + try: + seq = bytes(seq) + except TypeError: # string + pass + else: + seq = str(seq, "ASCII") + tStart, qStart = path[0] + for tEnd, qEnd in path[1:]: + tCount = tEnd - tStart + qCount = qEnd - qStart + if tCount == 0: + length = qCount + if pos is None or tEnd == tSize: + operation = "S" + else: + operation = "I" + qStart = qEnd + elif qCount == 0: + if tStart > 0 and tEnd < tSize: + length = tCount + operation = "D" + else: + operation = None + tStart = tEnd + else: + assert tCount == qCount + if pos is None: + pos = tStart + tStart = tEnd + qStart = qEnd + operation = "M" + length = tCount + if operation is not None: + cigar.append(str(length) + operation) + mapQ = 255 # not available + rNext = "*" + pNext = 0 + tLen = 0 + qual = "*" + cigar = "".join(cigar) + tag = "AS:i:%d" % int(round(self.score)) + words = [ + qName, + str(flag), + rName, + str(pos + 1), # 1-based coordinates + str(mapQ), + cigar, + rNext, + str(pNext), + str(tLen), + seq, + qual, + tag, + ] + line = "\t".join(words) + "\n" + return line + + def __str__(self): + return 
self.format() + + def __len__(self): + """Return the number of sequences in the alignment, which is always 2.""" + return 2 + + @property + def shape(self): + """Return the shape of the alignment as a tuple of two integer values. + + The first integer value is the number of sequences in the alignment as + returned by len(alignment), which is always 2 for pairwise alignments. + + The second integer value is the number of columns in the alignment when + it is printed, and is equal to the sum of the number of matches, number + of mismatches, and the total length of gaps in the target and query. + Sequence sections beyond the aligned segment are not included in the + number of columns. + + For example, + + >>> from Bio import Align + >>> aligner = Align.PairwiseAligner() + >>> aligner.mode = "global" + >>> alignments = aligner.align("GACCTG", "CGATCG") + >>> alignment = alignments[0] + >>> print(alignment) + -GACCT-G + -||--|-| + CGA--TCG + + >>> len(alignment) + 2 + >>> alignment.shape + (2, 8) + >>> aligner.mode = "local" + >>> alignments = aligner.align("GACCTG", "CGATCG") + >>> alignment = alignments[0] + >>> print(alignment) + GACCT-G + ||--|-| + CGA--TCG + + >>> len(alignment) + 2 + >>> alignment.shape + (2, 7) + """ + path = self.path + if path[0][1] > path[-1][1]: # mapped to reverse strand + n2 = len(self.query) + path = tuple((c1, n2 - c2) for (c1, c2) in path) + start = path[0] + n = len(start) + m = 0 + for end in path[1:]: + m += max(e - s for s, e in zip(start, end)) + start = end + return (n, m) + + @property + def aligned(self): + """Return the indices of subsequences aligned to each other. + + This property returns the start and end indices of subsequences + in the target and query sequence that were aligned to each other. + If the alignment between target (t) and query (q) consists of N + chunks, you get two tuples of length N: + + (((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)), + ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN))) + + For example, + + >>> from Bio import Align + >>> aligner = Align.PairwiseAligner() + >>> alignments = aligner.align("GAACT", "GAT") + >>> alignment = alignments[0] + >>> print(alignment) + GAACT + ||--| + GA--T + + >>> alignment.aligned + (((0, 2), (4, 5)), ((0, 2), (2, 3))) + >>> alignment = alignments[1] + >>> print(alignment) + GAACT + |-|-| + G-A-T + + >>> alignment.aligned + (((0, 1), (2, 3), (4, 5)), ((0, 1), (1, 2), (2, 3))) + + Note that different alignments may have the same subsequences + aligned to each other. In particular, this may occur if alignments + differ from each other in terms of their gap placement only: + + >>> aligner.mismatch_score = -10 + >>> alignments = aligner.align("AAACAAA", "AAAGAAA") + >>> len(alignments) + 2 + >>> print(alignments[0]) + AAAC-AAA + |||--||| + AAA-GAAA + + >>> alignments[0].aligned + (((0, 3), (4, 7)), ((0, 3), (4, 7))) + >>> print(alignments[1]) + AAA-CAAA + |||--||| + AAAG-AAA + + >>> alignments[1].aligned + (((0, 3), (4, 7)), ((0, 3), (4, 7))) + + The property can be used to identify alignments that are identical + to each other in terms of their aligned sequences. 
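Because `aligned` returns plain nested tuples, the segment pairs are hashable and can be used directly as set keys. A short sketch (reusing the aligner settings from the doctest above) of collapsing alignments that differ only in their gap placement:

```python
# Sketch: alignments that differ only in gap placement share identical
# aligned-segment tuples, so a set over .aligned collapses them to one entry.
from Bio import Align

aligner = Align.PairwiseAligner()
aligner.mismatch_score = -10
alignments = aligner.align("AAACAAA", "AAAGAAA")
distinct = {alignment.aligned for alignment in alignments}
print(len(alignments), "alignments,", len(distinct), "distinct")
# prints: 2 alignments, 1 distinct
```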
+ """ + segments1 = [] + segments2 = [] + path = self.path + if path[0][1] < path[-1][1]: # mapped to forward strand + i1, i2 = path[0] + for node in path[1:]: + j1, j2 = node + if j1 > i1 and j2 > i2: + segment1 = (i1, j1) + segment2 = (i2, j2) + segments1.append(segment1) + segments2.append(segment2) + i1, i2 = j1, j2 + else: # mapped to reverse strand + n2 = len(self.query) + i1, i2 = path[0] + i2 = n2 - i2 + for node in path[1:]: + j1, j2 = node + j2 = n2 - j2 + if j1 > i1 and j2 > i2: + segment1 = (i1, j1) + segment2 = (n2 - i2, n2 - j2) + segments1.append(segment1) + segments2.append(segment2) + i1, i2 = j1, j2 + return tuple(segments1), tuple(segments2) + + def sort(self, key=None, reverse=False): + """Sort the sequences of the alignment in place. + + By default, this sorts the sequences alphabetically using their id + attribute if available, or by their sequence contents otherwise. + For example, + + >>> from Bio.Align import PairwiseAligner + >>> aligner = PairwiseAligner() + >>> aligner.gap_score = -1 + >>> alignments = aligner.align("AATAA", "AAGAA") + >>> len(alignments) + 1 + >>> alignment = alignments[0] + >>> print(alignment) + AATAA + ||.|| + AAGAA + + >>> alignment.sort() + >>> print(alignment) + AAGAA + ||.|| + AATAA + + + Alternatively, a key function can be supplied that maps each sequence + to a sort value. For example, you could sort on the GC content of each + sequence. + + >>> from Bio.SeqUtils import GC + >>> alignment.sort(key=GC) + >>> print(alignment) + AATAA + ||.|| + AAGAA + + + You can reverse the sort order by passing `reverse=True`: + + >>> alignment.sort(key=GC, reverse=True) + >>> print(alignment) + AAGAA + ||.|| + AATAA + + + The sequences are now sorted by decreasing GC content value. + """ + path = self.path + sequences = self.target, self.query + if key is None: + try: + values = [sequence.id for sequence in sequences] + except AttributeError: + values = sequences + else: + values = [key(sequence) for sequence in sequences] + indices = sorted(range(len(sequences)), key=values.__getitem__, reverse=reverse) + sequences = [sequences[index] for index in indices] + self.target, self.query = sequences + path = tuple(tuple(row[index] for index in indices) for row in path) + self.path = path + + def map(self, alignment): + r"""Map the alignment to self.target and return the resulting alignment. + + Here, self.query and alignment.target are the same sequence. 
+ + A typical example is where self is the pairwise alignment between a + chromosome and a transcript, the argument is the pairwise alignment + between the transcript and a sequence (e.g., as obtained by RNA-seq), + and we want to find the alignment of the sequence to the chromosome: + + >>> from Bio import Align + >>> aligner = Align.PairwiseAligner() + >>> aligner.mode = 'local' + >>> aligner.open_gap_score = -1 + >>> aligner.extend_gap_score = 0 + >>> chromosome = "AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA" + >>> transcript = "CCCCCCCGGGGGG" + >>> alignments1 = aligner.align(chromosome, transcript) + >>> len(alignments1) + 1 + >>> alignment1 = alignments1[0] + >>> print(alignment1) + AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA + |||||||-----------|||||| + CCCCCCC-----------GGGGGG + + >>> sequence = "CCCCGGGG" + >>> alignments2 = aligner.align(transcript, sequence) + >>> len(alignments2) + 1 + >>> alignment2 = alignments2[0] + >>> print(alignment2) + CCCCCCCGGGGGG + |||||||| + CCCCGGGG + + >>> alignment = alignment1.map(alignment2) + >>> print(alignment) + AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA + ||||-----------|||| + CCCC-----------GGGG + + >>> format(alignment, "psl") + '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n' + + Mapping the alignment does not depend on the sequence contents. If we + delete the sequence contents, the same alignment is found in PSL format + (though we obviously lose the ability to print the sequence alignment): + + >>> alignment1.target = Seq(None, len(alignment1.target)) + >>> alignment1.query = Seq(None, len(alignment1.query)) + >>> alignment2.target = Seq(None, len(alignment2.target)) + >>> alignment2.query = Seq(None, len(alignment2.query)) + >>> alignment = alignment1.map(alignment2) + >>> format(alignment, "psl") + '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n' + """ + from numpy import array + + alignment1, alignment2 = self, alignment + if len(alignment1.query) != len(alignment2.target): + raise ValueError( + "length of alignment1 query sequence (%d) != length of alignment2 target sequence (%d)" + % (len(alignment1.query), len(alignment2.target)) + ) + target = alignment1.target + query = alignment2.query + path1 = alignment1.path + path2 = alignment2.path + n1 = len(alignment1.query) + n2 = len(alignment2.query) + if path1[0][1] < path1[-1][1]: # mapped to forward strand + strand1 = "+" + else: # mapped to reverse strand + strand1 = "-" + if path2[0][1] < path2[-1][1]: # mapped to forward strand + strand2 = "+" + else: # mapped to reverse strand + strand2 = "-" + path1 = array(path1) + path2 = array(path2) + if strand1 == "+": + if strand2 == "-": # mapped to reverse strand + path2[:, 1] = n2 - path2[:, 1] + else: # mapped to reverse strand + path1[:, 1] = n1 - path1[:, 1] + path2[:, 0] = n1 - path2[::-1, 0] + if strand2 == "+": + path2[:, 1] = n2 - path2[::-1, 1] + else: # mapped to reverse strand + path2[:, 1] = path2[::-1, 1] + path = [] + tEnd, qEnd = sys.maxsize, sys.maxsize + path1 = iter(path1) + tStart1, qStart1 = sys.maxsize, sys.maxsize + for tEnd1, qEnd1 in path1: + if tStart1 < tEnd1 and qStart1 < qEnd1: + break + tStart1, qStart1 = tEnd1, qEnd1 + tStart2, qStart2 = sys.maxsize, sys.maxsize + for tEnd2, qEnd2 in path2: + while qStart2 < qEnd2 and tStart2 < tEnd2: + while True: + if tStart2 < qStart1: + if tEnd2 < qStart1: + size = tEnd2 - tStart2 + else: + size = qStart1 - tStart2 + break + elif tStart2 < qEnd1: + offset = tStart2 - qStart1 + if tEnd2 > qEnd1: 
+ size = qEnd1 - tStart2 + else: + size = tEnd2 - tStart2 + qStart = qStart2 + tStart = tStart1 + offset + if tStart > tEnd and qStart > qEnd: + # adding a gap both in target and in query; + # add gap to target first: + path.append([tStart, qEnd]) + qEnd = qStart2 + size + tEnd = tStart + size + path.append([tStart, qStart]) + path.append([tEnd, qEnd]) + break + tStart1, qStart1 = sys.maxsize, sys.maxsize + for tEnd1, qEnd1 in path1: + if tStart1 < tEnd1 and qStart1 < qEnd1: + break + tStart1, qStart1 = tEnd1, qEnd1 + else: + size = qEnd2 - qStart2 + break + qStart2 += size + tStart2 += size + tStart2, qStart2 = tEnd2, qEnd2 + if strand1 != strand2: + path = tuple((c1, n2 - c2) for (c1, c2) in path) + alignment = PairwiseAlignment(target, query, path, None) + return alignment + + @property + def substitutions(self): + """Return an Array with the number of substitutions of letters in the alignment. + + As an example, consider a sequence alignment of two RNA sequences: + + >>> from Bio.Align import PairwiseAligner + >>> target = "ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG" # human spliceosomal small nuclear RNA U1 + >>> query = "ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTCTTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG" # sea lamprey spliceosomal small RNA U1 + >>> aligner = PairwiseAligner() + >>> aligner.gap_score = -10 + >>> alignments = aligner.align(target, query) + >>> len(alignments) + 1 + >>> alignment = alignments[0] + >>> print(alignment) + ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG + |||||||||||.||||||||..|||||||||||..|||||||..|||||||||||||||..|||||||||||.|||..|.|.|||||||||||||||||||||||||||||||||||||||.||||||||||||||||||||||||||||||||||.|||||.| + ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTCTTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG + + >>> m = alignment.substitutions + >>> print(m) + A C G T + A 28.0 1.0 2.0 1.0 + C 0.0 39.0 1.0 2.0 + G 2.0 0.0 45.0 0.0 + T 2.0 5.0 1.0 35.0 + + + Note that the matrix is not symmetric: rows correspond to the target + sequence, and columns to the query sequence. For example, the number + of T's in the target sequence that are aligned to a C in the query + sequence is + + >>> m['T', 'C'] + 5.0 + + and the number of C's in the target sequence that are aligned to a T in + the query sequence is + + >>> m['C', 'T'] + 2.0 + + For some applications (for example, to define a scoring matrix from + the substitution matrix), a symmetric matrix may be preferred, which + can be calculated as follows: + + >>> m += m.transpose() + >>> m /= 2.0 + >>> print(m) + A C G T + A 28.0 0.5 2.0 1.5 + C 0.5 39.0 0.5 3.5 + G 2.0 0.5 45.0 0.5 + T 1.5 3.5 0.5 35.0 + + + The matrix is now symmetric, with counts divided equally on both sides + of the diagonal: + + >>> m['C', 'T'] + 3.5 + >>> m['T', 'C'] + 3.5 + + The total number of substitutions between T's and C's in the alignment + is 3.5 + 3.5 = 7.
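One practical use of the substitutions matrix is a quick identity summary: identities sit on the diagonal, and the total sum counts every aligned (gap-free) residue pair. A hedged sketch follows; the short sequences are made up for illustration, and since `Array` is a `numpy.ndarray` subclass it can be handed to numpy as-is:

```python
# Sketch: percent identity over aligned columns from the substitutions
# matrix. np.asarray strips the Array wrapper so plain numpy reductions apply.
import numpy as np
from Bio.Align import PairwiseAligner

aligner = PairwiseAligner()
aligner.gap_score = -10
alignment = aligner.align("ATACTTACC", "ATACTGACC")[0]
counts = np.asarray(alignment.substitutions)
identity = counts.trace() / counts.sum()  # diagonal entries = identities
print("%.1f%% identity" % (100.0 * identity))
# prints: 88.9% identity (8 of the 9 aligned columns match)
```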
+ """ + target = self.target + try: + target = target.seq + except AttributeError: + pass + query = self.query + try: + query = query.seq + except AttributeError: + pass + sequences = (str(target), str(query)) + letters = set.union(*[set(sequence) for sequence in sequences]) + letters = "".join(sorted(letters)) + m = substitution_matrices.Array(letters, dims=2) + n = len(sequences) + for i1 in range(n): + path1 = [p[i1] for p in self.path] + sequence1 = sequences[i1] + for i2 in range(i1 + 1, n): + path2 = [p[i2] for p in self.path] + sequence2 = sequences[i2] + start1, start2 = sys.maxsize, sys.maxsize + for end1, end2 in zip(path1, path2): + if start1 < end1 and start2 < end2: # aligned + segment1 = sequence1[start1:end1] + segment2 = sequence2[start2:end2] + for c1, c2 in zip(segment1, segment2): + m[c1, c2] += 1.0 + start1, start2 = end1, end2 + return m + + +class PairwiseAlignments: + """Implements an iterator over pairwise alignments returned by the aligner. + + This class also supports indexing, which is fast for increasing indices, + but may be slow for random access of a large number of alignments. + + Note that pairwise aligners can return an astronomical number of alignments, + even for relatively short sequences, if they align poorly to each other. We + therefore recommend to first check the number of alignments, accessible as + len(alignments), which can be calculated quickly even if the number of + alignments is very large. + """ + + def __init__(self, seqA, seqB, score, paths): + """Initialize a new PairwiseAlignments object. + + Arguments: + - seqA - The first sequence, as a plain string, without gaps. + - seqB - The second sequence, as a plain string, without gaps. + - score - The alignment score. + - paths - An iterator over the paths in the traceback matrix; + each path defines one alignment. + + You would normally obtain an PairwiseAlignments object by calling + aligner.align(seqA, seqB), where aligner is a PairwiseAligner object. + """ + self.seqA = seqA + self.seqB = seqB + self.score = score + self.paths = paths + self.index = -1 + + def __len__(self): + return len(self.paths) + + def __getitem__(self, index): + if index == self.index: + return self.alignment + if index < self.index: + self.paths.reset() + self.index = -1 + while self.index < index: + try: + alignment = next(self) + except StopIteration: + raise IndexError("index out of range") from None + return alignment + + def __iter__(self): + self.paths.reset() + self.index = -1 + return self + + def __next__(self): + path = next(self.paths) + self.index += 1 + alignment = PairwiseAlignment(self.seqA, self.seqB, path, self.score) + self.alignment = alignment + return alignment + + +class PairwiseAligner(_aligners.PairwiseAligner): + """Performs pairwise sequence alignment using dynamic programming. + + This provides functions to get global and local alignments between two + sequences. A global alignment finds the best concordance between all + characters in two sequences. A local alignment finds just the + subsequences that align the best. + + To perform a pairwise sequence alignment, first create a PairwiseAligner + object. This object stores the match and mismatch scores, as well as the + gap scores. Typically, match scores are positive, while mismatch scores + and gap scores are negative or zero. By default, the match score is 1, + and the mismatch and gap scores are zero. 
Based on the values of the gap + scores, a PairwiseAligner object automatically chooses the appropriate + alignment algorithm (the Needleman-Wunsch, Smith-Waterman, Gotoh, or + Waterman-Smith-Beyer global or local alignment algorithm). + + Calling the "score" method on the aligner with two sequences as arguments + will calculate the alignment score between the two sequences. + Calling the "align" method on the aligner with two sequences as arguments + will return a generator yielding the alignments between the two + sequences. + + Some examples: + + >>> from Bio import Align + >>> aligner = Align.PairwiseAligner() + >>> alignments = aligner.align("TACCG", "ACG") + >>> for alignment in sorted(alignments): + ... print("Score = %.1f:" % alignment.score) + ... print(alignment) + ... + Score = 3.0: + TACCG + -|-|| + -A-CG + + Score = 3.0: + TACCG + -||-| + -AC-G + + + Specify the aligner mode as local to generate local alignments: + + >>> aligner.mode = 'local' + >>> alignments = aligner.align("TACCG", "ACG") + >>> for alignment in sorted(alignments): + ... print("Score = %.1f:" % alignment.score) + ... print(alignment) + ... + Score = 3.0: + TACCG + |-|| + A-CG + + Score = 3.0: + TACCG + ||-| + AC-G + + + Do a global alignment. Identical characters are given 2 points, + 1 point is deducted for each non-identical character. + + >>> aligner.mode = 'global' + >>> aligner.match_score = 2 + >>> aligner.mismatch_score = -1 + >>> for alignment in aligner.align("TACCG", "ACG"): + ... print("Score = %.1f:" % alignment.score) + ... print(alignment) + ... + Score = 6.0: + TACCG + -||-| + -AC-G + + Score = 6.0: + TACCG + -|-|| + -A-CG + + + Same as above, except now 0.5 points are deducted when opening a + gap, and 0.1 points are deducted when extending it. + + >>> aligner.open_gap_score = -0.5 + >>> aligner.extend_gap_score = -0.1 + >>> aligner.target_end_gap_score = 0.0 + >>> aligner.query_end_gap_score = 0.0 + >>> for alignment in aligner.align("TACCG", "ACG"): + ... print("Score = %.1f:" % alignment.score) + ... print(alignment) + ... + Score = 5.5: + TACCG + -|-|| + -A-CG + + Score = 5.5: + TACCG + -||-| + -AC-G + + + The alignment function can also use known matrices already included in + Biopython: + + >>> from Bio.Align import substitution_matrices + >>> aligner = Align.PairwiseAligner() + >>> aligner.substitution_matrix = substitution_matrices.load("BLOSUM62") + >>> alignments = aligner.align("KEVLA", "EVL") + >>> alignments = list(alignments) + >>> print("Number of alignments: %d" % len(alignments)) + Number of alignments: 1 + >>> alignment = alignments[0] + >>> print("Score = %.1f" % alignment.score) + Score = 13.0 + >>> print(alignment) + KEVLA + -|||- + -EVL- + + + You can also set the value of attributes directly during construction + of the PairwiseAligner object by providing them as keyword arguments: + + >>> aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1) + >>> for alignment in aligner.align("TACCG", "ACG"): + ... print("Score = %.1f:" % alignment.score) + ... print(alignment) + ... + Score = 6.0: + TACCG + -||-| + -AC-G + + Score = 6.0: + TACCG + -|-|| + -A-CG + + + """ + + def __init__(self, **kwargs): + """Initialize a new PairwiseAligner with the keyword arguments as attributes. + + Loops over the keyword arguments and sets them as attributes on the object.
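Since the constructor only forwards keyword arguments to `setattr`, and `__setattr__` below rejects any name that is not a predefined aligner attribute, a misspelled score name raises immediately instead of being silently ignored. A small sketch:

```python
# Sketch: keyword arguments at construction are equivalent to setting the
# attributes afterwards; unknown attribute names raise AttributeError.
from Bio import Align

a1 = Align.PairwiseAligner(mode="global", match_score=2, mismatch_score=-1)
a2 = Align.PairwiseAligner()
a2.mode = "global"
a2.match_score = 2
a2.mismatch_score = -1
assert a1.match_score == a2.match_score == 2

try:
    Align.PairwiseAligner(matchscore=2)  # hypothetical typo of match_score
except AttributeError as err:
    print(err)  # 'PairwiseAligner' object has no attribute 'matchscore'
```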
+ """ + super().__init__() + for name, value in kwargs.items(): + setattr(self, name, value) + + def __setattr__(self, key, value): + if key not in dir(_aligners.PairwiseAligner): + # To prevent confusion, don't allow users to create new attributes. + # On CPython, __slots__ can be used for this, but currently + # __slots__ does not behave the same way on PyPy at least. + raise AttributeError("'PairwiseAligner' object has no attribute '%s'" % key) + _aligners.PairwiseAligner.__setattr__(self, key, value) + + def align(self, seqA, seqB, strand="+"): + """Return the alignments of two sequences using PairwiseAligner.""" + if isinstance(seqA, (Seq, MutableSeq)): + sA = bytes(seqA) + else: + sA = seqA + if strand == "+": + sB = seqB + else: # strand == "-": + sB = reverse_complement(seqB) + if isinstance(sB, (Seq, MutableSeq)): + sB = bytes(sB) + score, paths = _aligners.PairwiseAligner.align(self, sA, sB, strand) + alignments = PairwiseAlignments(seqA, seqB, score, paths) + return alignments + + def score(self, seqA, seqB, strand="+"): + """Return the alignments score of two sequences using PairwiseAligner.""" + if isinstance(seqA, (Seq, MutableSeq)): + seqA = bytes(seqA) + if strand == "-": + seqB = reverse_complement(seqB) + if isinstance(seqB, (Seq, MutableSeq)): + seqB = bytes(seqB) + return _aligners.PairwiseAligner.score(self, seqA, seqB, strand) + + def __getstate__(self): + state = { + "wildcard": self.wildcard, + "target_internal_open_gap_score": self.target_internal_open_gap_score, + "target_internal_extend_gap_score": self.target_internal_extend_gap_score, + "target_left_open_gap_score": self.target_left_open_gap_score, + "target_left_extend_gap_score": self.target_left_extend_gap_score, + "target_right_open_gap_score": self.target_right_open_gap_score, + "target_right_extend_gap_score": self.target_right_extend_gap_score, + "query_internal_open_gap_score": self.query_internal_open_gap_score, + "query_internal_extend_gap_score": self.query_internal_extend_gap_score, + "query_left_open_gap_score": self.query_left_open_gap_score, + "query_left_extend_gap_score": self.query_left_extend_gap_score, + "query_right_open_gap_score": self.query_right_open_gap_score, + "query_right_extend_gap_score": self.query_right_extend_gap_score, + "mode": self.mode, + } + if self.substitution_matrix is None: + state["match_score"] = self.match_score + state["mismatch_score"] = self.mismatch_score + else: + state["substitution_matrix"] = self.substitution_matrix + return state + + def __setstate__(self, state): + self.wildcard = state["wildcard"] + self.target_internal_open_gap_score = state["target_internal_open_gap_score"] + self.target_internal_extend_gap_score = state[ + "target_internal_extend_gap_score" + ] + self.target_left_open_gap_score = state["target_left_open_gap_score"] + self.target_left_extend_gap_score = state["target_left_extend_gap_score"] + self.target_right_open_gap_score = state["target_right_open_gap_score"] + self.target_right_extend_gap_score = state["target_right_extend_gap_score"] + self.query_internal_open_gap_score = state["query_internal_open_gap_score"] + self.query_internal_extend_gap_score = state["query_internal_extend_gap_score"] + self.query_left_open_gap_score = state["query_left_open_gap_score"] + self.query_left_extend_gap_score = state["query_left_extend_gap_score"] + self.query_right_open_gap_score = state["query_right_open_gap_score"] + self.query_right_extend_gap_score = state["query_right_extend_gap_score"] + self.mode = state["mode"] + substitution_matrix = 
state.get("substitution_matrix") + if substitution_matrix is None: + self.match_score = state["match_score"] + self.mismatch_score = state["mismatch_score"] + else: + self.substitution_matrix = substitution_matrix + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc b/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc new file mode 100644 index 0000000..7955c2f Binary files /dev/null and b/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..54c2237 Binary files /dev/null and b/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/_aligners.c b/code/lib/Bio/Align/_aligners.c new file mode 100644 index 0000000..e78f252 --- /dev/null +++ b/code/lib/Bio/Align/_aligners.c @@ -0,0 +1,6988 @@ +/* Copyright 2018-2019 by Michiel de Hoon. All rights reserved. + * This file is part of the Biopython distribution and governed by your + * choice of the "Biopython License Agreement" or the "BSD 3-Clause License". + * Please see the LICENSE file that should have been included as part of this + * package. + */ + + + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "float.h" + + +#define HORIZONTAL 0x1 +#define VERTICAL 0x2 +#define DIAGONAL 0x4 +#define STARTPOINT 0x8 +#define ENDPOINT 0x10 +#define M_MATRIX 0x1 +#define Ix_MATRIX 0x2 +#define Iy_MATRIX 0x4 +#define DONE 0x3 +#define NONE 0x7 + +#define OVERFLOW_ERROR -1 +#define MEMORY_ERROR -2 + +#define MISSING_LETTER -1 + +#define SAFE_ADD(t, s) \ +{ if (s != OVERFLOW_ERROR) { \ + term = t; \ + if (term > PY_SSIZE_T_MAX - s) s = OVERFLOW_ERROR; \ + else s += term; \ + } \ +} + + +typedef enum {NeedlemanWunschSmithWaterman, + Gotoh, + WatermanSmithBeyer, + Unknown} Algorithm; + +typedef enum {Global, Local} Mode; + +typedef struct { + unsigned char trace : 5; + unsigned char path : 3; +} Trace; + +typedef struct { + unsigned char Ix : 4; + unsigned char Iy : 4; +} TraceGapsGotoh; + +typedef struct { + int* MIx; + int* IyIx; + int* MIy; + int* IxIy; +} TraceGapsWatermanSmithBeyer; + +typedef struct { + PyObject_HEAD + Trace** M; + union { TraceGapsGotoh** gotoh; + TraceGapsWatermanSmithBeyer** waterman_smith_beyer; } gaps; + int nA; + int nB; + int iA; + int iB; + Mode mode; + Algorithm algorithm; + Py_ssize_t length; + unsigned char strand; +} PathGenerator; + +static PyObject* +PathGenerator_create_path(PathGenerator* self, int i, int j) { + PyObject* tuple; + PyObject* row; + PyObject* value; + int path; + const int ii = i; + const int jj = j; + int n = 1; + int direction = 0; + Trace** M = self->M; + const unsigned char strand = self->strand; + + while (1) { + path = M[i][j].path; + if (!path) break; + if (path != direction) { + n++; + direction = path; + } + switch (path) { + case HORIZONTAL: j++; break; + case VERTICAL: i++; break; + case DIAGONAL: i++; j++; break; + } + } + + i = ii; + j = jj; + direction = 0; + tuple = PyTuple_New(n); + if (!tuple) return NULL; + + n = 0; + switch (strand) { + case '+': + while (1) { + path = M[i][j].path; + if (path != direction) { + row = PyTuple_New(2); + if (!row) break; + value = PyLong_FromLong(i); + if (!value) { + Py_DECREF(row); /* all references were stolen */ + break; + } + PyTuple_SET_ITEM(row, 0, value); + value = PyLong_FromLong(j); + if (!value) { + Py_DECREF(row); /* all 
references were stolen */ + break; + } + PyTuple_SET_ITEM(row, 1, value); + PyTuple_SET_ITEM(tuple, n, row); + n++; + direction = path; + } + switch (path) { + case HORIZONTAL: j++; break; + case VERTICAL: i++; break; + case DIAGONAL: i++; j++; break; + default: return tuple; + } + } + break; + case '-': { + const int nB = self->nB; + while (1) { + path = M[i][j].path; + if (path != direction) { + row = PyTuple_New(2); + if (!row) break; + value = PyLong_FromLong(i); + if (!value) { + Py_DECREF(row); /* all references were stolen */ + break; + } + PyTuple_SET_ITEM(row, 0, value); + value = PyLong_FromLong(nB-j); + if (!value) { + Py_DECREF(row); /* all references were stolen */ + break; + } + PyTuple_SET_ITEM(row, 1, value); + PyTuple_SET_ITEM(tuple, n, row); + n++; + direction = path; + } + switch (path) { + case HORIZONTAL: j++; break; + case VERTICAL: i++; break; + case DIAGONAL: i++; j++; break; + default: return tuple; + } + } + break; + } + } + Py_DECREF(tuple); /* all references were stolen */ + return PyErr_NoMemory(); +} + +static Py_ssize_t +PathGenerator_needlemanwunsch_length(PathGenerator* self) +{ + int i; + int j; + int trace; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + Py_ssize_t term; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t temp; + Py_ssize_t* counts; + counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!counts) goto exit; + counts[0] = 1; + for (j = 1; j <= nB; j++) { + trace = M[0][j].trace; + count = 0; + if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count); + counts[j] = count; + } + for (i = 1; i <= nA; i++) { + trace = M[i][0].trace; + count = 0; + if (trace & VERTICAL) SAFE_ADD(counts[0], count); + temp = counts[0]; + counts[0] = count; + for (j = 1; j <= nB; j++) { + trace = M[i][j].trace; + count = 0; + if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count); + if (trace & VERTICAL) SAFE_ADD(counts[j], count); + if (trace & DIAGONAL) SAFE_ADD(temp, count); + temp = counts[j]; + counts[j] = count; + } + } + PyMem_Free(counts); +exit: + return count; +} + +static Py_ssize_t +PathGenerator_smithwaterman_length(PathGenerator* self) +{ + int i; + int j; + int trace; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + Py_ssize_t term; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t total = 0; + Py_ssize_t temp; + Py_ssize_t* counts; + counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!counts) goto exit; + counts[0] = 1; + for (j = 1; j <= nB; j++) counts[j] = 1; + for (i = 1; i <= nA; i++) { + temp = counts[0]; + counts[0] = 1; + for (j = 1; j <= nB; j++) { + trace = M[i][j].trace; + count = 0; + if (trace & DIAGONAL) SAFE_ADD(temp, count); + if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total); + if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count); + if (trace & VERTICAL) SAFE_ADD(counts[j], count); + temp = counts[j]; + if (count == 0 && (trace & STARTPOINT)) count = 1; + counts[j] = count; + } + } + count = total; + PyMem_Free(counts); +exit: + return count; +} + +static Py_ssize_t +PathGenerator_gotoh_global_length(PathGenerator* self) +{ + int i; + int j; + int trace; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsGotoh** gaps = self->gaps.gotoh; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t term; + Py_ssize_t M_temp; + Py_ssize_t Ix_temp; + Py_ssize_t Iy_temp; + Py_ssize_t* M_counts = NULL; + Py_ssize_t* Ix_counts = NULL; + Py_ssize_t* Iy_counts = NULL; + M_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!M_counts) goto exit; + 
Ix_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Ix_counts) goto exit; + Iy_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Iy_counts) goto exit; + M_counts[0] = 1; + Ix_counts[0] = 0; + Iy_counts[0] = 0; + for (j = 1; j <= nB; j++) { + M_counts[j] = 0; + Ix_counts[j] = 0; + Iy_counts[j] = 1; + } + for (i = 1; i <= nA; i++) { + M_temp = M_counts[0]; + M_counts[0] = 0; + Ix_temp = Ix_counts[0]; + Ix_counts[0] = 1; + Iy_temp = Iy_counts[0]; + Iy_counts[0] = 0; + for (j = 1; j <= nB; j++) { + count = 0; + trace = M[i][j].trace; + if (trace & M_MATRIX) SAFE_ADD(M_temp, count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_temp, count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_temp, count); + M_temp = M_counts[j]; + M_counts[j] = count; + count = 0; + trace = gaps[i][j].Ix; + if (trace & M_MATRIX) SAFE_ADD(M_temp, count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j], count); + Ix_temp = Ix_counts[j]; + Ix_counts[j] = count; + count = 0; + trace = gaps[i][j].Iy; + if (trace & M_MATRIX) SAFE_ADD(M_counts[j-1], count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j-1], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j-1], count); + Iy_temp = Iy_counts[j]; + Iy_counts[j] = count; + } + } + count = 0; + if (M[nA][nB].trace) SAFE_ADD(M_counts[nB], count); + if (gaps[nA][nB].Ix) SAFE_ADD(Ix_counts[nB], count); + if (gaps[nA][nB].Iy) SAFE_ADD(Iy_counts[nB], count); +exit: + if (M_counts) PyMem_Free(M_counts); + if (Ix_counts) PyMem_Free(Ix_counts); + if (Iy_counts) PyMem_Free(Iy_counts); + return count; +} + +static Py_ssize_t +PathGenerator_gotoh_local_length(PathGenerator* self) +{ + int i; + int j; + int trace; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsGotoh** gaps = self->gaps.gotoh; + Py_ssize_t term; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t total = 0; + Py_ssize_t M_temp; + Py_ssize_t Ix_temp; + Py_ssize_t Iy_temp; + Py_ssize_t* M_counts = NULL; + Py_ssize_t* Ix_counts = NULL; + Py_ssize_t* Iy_counts = NULL; + M_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!M_counts) goto exit; + Ix_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Ix_counts) goto exit; + Iy_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Iy_counts) goto exit; + M_counts[0] = 1; + Ix_counts[0] = 0; + Iy_counts[0] = 0; + for (j = 1; j <= nB; j++) { + M_counts[j] = 1; + Ix_counts[j] = 0; + Iy_counts[j] = 0; + } + for (i = 1; i <= nA; i++) { + M_temp = M_counts[0]; + M_counts[0] = 1; + Ix_temp = Ix_counts[0]; + Ix_counts[0] = 0; + Iy_temp = Iy_counts[0]; + Iy_counts[0] = 0; + for (j = 1; j <= nB; j++) { + count = 0; + trace = M[i][j].trace; + if (trace & M_MATRIX) SAFE_ADD(M_temp, count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_temp, count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_temp, count); + if (count == 0 && (trace & STARTPOINT)) count = 1; + M_temp = M_counts[j]; + M_counts[j] = count; + if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total); + count = 0; + trace = gaps[i][j].Ix; + if (trace & M_MATRIX) SAFE_ADD(M_temp, count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j], count); + Ix_temp = Ix_counts[j]; + Ix_counts[j] = count; + count = 0; + trace = gaps[i][j].Iy; + if (trace & M_MATRIX) SAFE_ADD(M_counts[j-1], count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j-1], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j-1], count); + Iy_temp = Iy_counts[j]; + Iy_counts[j] = count; + } + } + count = total; +exit: + if (M_counts) 
PyMem_Free(M_counts); + if (Ix_counts) PyMem_Free(Ix_counts); + if (Iy_counts) PyMem_Free(Iy_counts); + return count; +} + +static Py_ssize_t +PathGenerator_waterman_smith_beyer_global_length(PathGenerator* self) +{ + int i; + int j; + int trace; + int* p; + int gap; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t term; + Py_ssize_t** M_count = NULL; + Py_ssize_t** Ix_count = NULL; + Py_ssize_t** Iy_count = NULL; + M_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!M_count) goto exit; + Ix_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!Ix_count) goto exit; + Iy_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!Iy_count) goto exit; + for (i = 0; i <= nA; i++) { + M_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!M_count[i]) goto exit; + Ix_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Ix_count[i]) goto exit; + Iy_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Iy_count[i]) goto exit; + } + for (i = 0; i <= nA; i++) { + for (j = 0; j <= nB; j++) { + count = 0; + trace = M[i][j].trace; + if (trace & M_MATRIX) SAFE_ADD(M_count[i-1][j-1], count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_count[i-1][j-1], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_count[i-1][j-1], count); + if (count == 0) count = 1; /* happens at M[0][0] only */ + M_count[i][j] = count; + count = 0; + p = gaps[i][j].MIx; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(M_count[i-gap][j], count); + p++; + } + } + p = gaps[i][j].IyIx; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(Iy_count[i-gap][j], count); + p++; + } + } + Ix_count[i][j] = count; + count = 0; + p = gaps[i][j].MIy; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(M_count[i][j-gap], count); + p++; + } + } + p = gaps[i][j].IxIy; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(Ix_count[i][j-gap], count); + p++; + } + } + Iy_count[i][j] = count; + } + } + count = 0; + if (M[nA][nB].trace) + SAFE_ADD(M_count[nA][nB], count); + if (gaps[nA][nB].MIx[0] || gaps[nA][nB].IyIx[0]) + SAFE_ADD(Ix_count[nA][nB], count); + if (gaps[nA][nB].MIy[0] || gaps[nA][nB].IxIy[0]) + SAFE_ADD(Iy_count[nA][nB], count); +exit: + if (M_count) { + if (Ix_count) { + if (Iy_count) { + for (i = 0; i <= nA; i++) { + if (!M_count[i]) break; + PyMem_Free(M_count[i]); + if (!Ix_count[i]) break; + PyMem_Free(Ix_count[i]); + if (!Iy_count[i]) break; + PyMem_Free(Iy_count[i]); + } + PyMem_Free(Iy_count); + } + PyMem_Free(Ix_count); + } + PyMem_Free(M_count); + } + return count; +} + +static Py_ssize_t +PathGenerator_waterman_smith_beyer_local_length(PathGenerator* self) +{ + int i; + int j; + int trace; + int* p; + int gap; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer; + Py_ssize_t term; + Py_ssize_t count = MEMORY_ERROR; + Py_ssize_t total = 0; + Py_ssize_t** M_count = NULL; + Py_ssize_t** Ix_count = NULL; + Py_ssize_t** Iy_count = NULL; + M_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!M_count) goto exit; + Ix_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!Ix_count) goto exit; + Iy_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*)); + if (!Iy_count) goto exit; + for (i = 0; i <= nA; i++) { + M_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!M_count[i]) goto exit; + Ix_count[i] = 
PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Ix_count[i]) goto exit; + Iy_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t)); + if (!Iy_count[i]) goto exit; + } + for (i = 0; i <= nA; i++) { + for (j = 0; j <= nB; j++) { + count = 0; + trace = M[i][j].trace; + if (trace & M_MATRIX) SAFE_ADD(M_count[i-1][j-1], count); + if (trace & Ix_MATRIX) SAFE_ADD(Ix_count[i-1][j-1], count); + if (trace & Iy_MATRIX) SAFE_ADD(Iy_count[i-1][j-1], count); + if (count == 0 && (trace & STARTPOINT)) count = 1; + M_count[i][j] = count; + if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total); + count = 0; + p = gaps[i][j].MIx; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(M_count[i-gap][j], count); + p++; + } + } + p = gaps[i][j].IyIx; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(Iy_count[i-gap][j], count); + p++; + } + } + Ix_count[i][j] = count; + count = 0; + p = gaps[i][j].MIy; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(M_count[i][j-gap], count); + p++; + } + } + p = gaps[i][j].IxIy; + if (p) { + while (1) { + gap = *p; + if (!gap) break; + SAFE_ADD(Ix_count[i][j-gap], count); + p++; + } + } + Iy_count[i][j] = count; + } + } + count = total; +exit: + if (M_count) { + if (Ix_count) { + if (Iy_count) { + for (i = 0; i <= nA; i++) { + if (!M_count[i]) break; + PyMem_Free(M_count[i]); + if (!Ix_count[i]) break; + PyMem_Free(Ix_count[i]); + if (!Iy_count[i]) break; + PyMem_Free(Iy_count[i]); + } + PyMem_Free(Iy_count); + } + PyMem_Free(Ix_count); + } + PyMem_Free(M_count); + } + return count; +} + +static Py_ssize_t PathGenerator_length(PathGenerator* self) { + Py_ssize_t length = self->length; + if (length == 0) { + switch (self->algorithm) { + case NeedlemanWunschSmithWaterman: + switch (self->mode) { + case Global: + length = PathGenerator_needlemanwunsch_length(self); + break; + case Local: + length = PathGenerator_smithwaterman_length(self); + break; + default: + /* should not happen, but some compilers complain that + * that length can be used uninitialized. + */ + PyErr_SetString(PyExc_RuntimeError, "Unknown mode"); + return -1; + } + break; + case Gotoh: + switch (self->mode) { + case Global: + length = PathGenerator_gotoh_global_length(self); + break; + case Local: + length = PathGenerator_gotoh_local_length(self); + break; + default: + /* should not happen, but some compilers complain that + * that length can be used uninitialized. + */ + PyErr_SetString(PyExc_RuntimeError, "Unknown mode"); + return -1; + } + break; + case WatermanSmithBeyer: + switch (self->mode) { + case Global: + length = PathGenerator_waterman_smith_beyer_global_length(self); + break; + case Local: + length = PathGenerator_waterman_smith_beyer_local_length(self); + break; + default: + /* should not happen, but some compilers complain that + * that length can be used uninitialized. 
+ */ + PyErr_SetString(PyExc_RuntimeError, "Unknown mode"); + return -1; + } + break; + case Unknown: + default: + PyErr_SetString(PyExc_RuntimeError, "Unknown algorithm"); + return -1; + } + self->length = length; + } + switch (length) { + case OVERFLOW_ERROR: + PyErr_Format(PyExc_OverflowError, + "number of optimal alignments is larger than %zd", + PY_SSIZE_T_MAX); + break; + case MEMORY_ERROR: + PyErr_SetNone(PyExc_MemoryError); + break; + default: + break; + } + return length; +} + +static void +PathGenerator_dealloc(PathGenerator* self) +{ + int i; + const int nA = self->nA; + const Algorithm algorithm = self->algorithm; + Trace** M = self->M; + if (M) { + for (i = 0; i <= nA; i++) { + if (!M[i]) break; + PyMem_Free(M[i]); + } + PyMem_Free(M); + } + switch (algorithm) { + case NeedlemanWunschSmithWaterman: + break; + case Gotoh: { + TraceGapsGotoh** gaps = self->gaps.gotoh; + if (gaps) { + for (i = 0; i <= nA; i++) { + if (!gaps[i]) break; + PyMem_Free(gaps[i]); + } + PyMem_Free(gaps); + } + break; + } + case WatermanSmithBeyer: { + TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer; + if (gaps) { + int j; + const int nB = self->nB; + int* trace; + for (i = 0; i <= nA; i++) { + if (!gaps[i]) break; + for (j = 0; j <= nB; j++) { + trace = gaps[i][j].MIx; + if (trace) PyMem_Free(trace); + trace = gaps[i][j].IyIx; + if (trace) PyMem_Free(trace); + trace = gaps[i][j].MIy; + if (trace) PyMem_Free(trace); + trace = gaps[i][j].IxIy; + if (trace) PyMem_Free(trace); + } + PyMem_Free(gaps[i]); + } + PyMem_Free(gaps); + } + break; + } + case Unknown: + default: + PyErr_WriteUnraisable((PyObject*)self); + break; + } + Py_TYPE(self)->tp_free((PyObject*)self); +} + +static PyObject* PathGenerator_next_needlemanwunsch(PathGenerator* self) +{ + int i = 0; + int j = 0; + int path; + int trace = 0; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + + path = M[i][j].path; + if (path == DONE) return NULL; + if (path == 0) { + /* Generate the first path. */ + i = nA; + j = nB; + } + else { + /* We already have a path. Prune the path to see if there are + * any alternative paths. */ + while (1) { + if (path == HORIZONTAL) { + trace = M[i][++j].trace; + if (trace & VERTICAL) { + M[--i][j].path = VERTICAL; + break; + } + if (trace & DIAGONAL) { + M[--i][--j].path = DIAGONAL; + break; + } + } + else if (path == VERTICAL) { + trace = M[++i][j].trace; + if (trace & DIAGONAL) { + M[--i][--j].path = DIAGONAL; + break; + } + } + else /* DIAGONAL */ { + i++; + j++; + } + path = M[i][j].path; + if (!path) { + /* we reached the end of the alignment without finding + * an alternative path */ + M[0][0].path = DONE; + return NULL; + } + } + } + /* Follow the traceback until we reach the origin. */ + while (1) { + trace = M[i][j].trace; + if (trace & HORIZONTAL) M[i][--j].path = HORIZONTAL; + else if (trace & VERTICAL) M[--i][j].path = VERTICAL; + else if (trace & DIAGONAL) M[--i][--j].path = DIAGONAL; + else break; + } + return PathGenerator_create_path(self, 0, 0); +} + +static PyObject* PathGenerator_next_smithwaterman(PathGenerator* self) +{ + int trace = 0; + int i = self->iA; + int j = self->iB; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + int path = M[0][0].path; + + if (path == DONE || path == NONE) return NULL; + + path = M[i][j].path; + if (path) { + /* We already have a path. Prune the path to see if there are + * any alternative paths. 
*/ + while (1) { + if (path == HORIZONTAL) { + trace = M[i][++j].trace; + if (trace & VERTICAL) { + M[--i][j].path = VERTICAL; + break; + } + else if (trace & DIAGONAL) { + M[--i][--j].path = DIAGONAL; + break; + } + } + else if (path == VERTICAL) { + trace = M[++i][j].trace; + if (trace & DIAGONAL) { + M[--i][--j].path = DIAGONAL; + break; + } + } + else /* DIAGONAL */ { + i++; + j++; + } + path = M[i][j].path; + if (!path) break; + } + } + + if (path) { + trace = M[i][j].trace; + } else { + /* Find a suitable end point for a path. + * Only allow end points ending at the M matrix. */ + while (1) { + if (j < nB) j++; + else if (i < nA) { + i++; + j = 0; + } + else { + /* we reached the end of the sequences without finding + * an alternative path */ + M[0][0].path = DONE; + return NULL; + } + trace = M[i][j].trace; + if (trace & ENDPOINT) { + trace &= DIAGONAL; /* exclude paths ending in a gap */ + break; + } + } + M[i][j].path = 0; + } + + /* Follow the traceback until we reach the origin. */ + while (1) { + if (trace & HORIZONTAL) M[i][--j].path = HORIZONTAL; + else if (trace & VERTICAL) M[--i][j].path = VERTICAL; + else if (trace & DIAGONAL) M[--i][--j].path = DIAGONAL; + else if (trace & STARTPOINT) { + self->iA = i; + self->iB = j; + return PathGenerator_create_path(self, i, j); + } + else { + PyErr_SetString(PyExc_RuntimeError, + "Unexpected trace in PathGenerator_next_smithwaterman"); + return NULL; + } + trace = M[i][j].trace; + } +} + +static PyObject* PathGenerator_next_gotoh_global(PathGenerator* self) +{ + int i = 0; + int j = 0; + int m; + int path; + int trace = 0; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsGotoh** gaps = self->gaps.gotoh; + + m = M_MATRIX; + path = M[i][j].path; + if (path == DONE) return NULL; + if (path == 0) { + i = nA; + j = nB; + } + else { + /* We already have a path. Prune the path to see if there are + * any alternative paths. */ + while (1) { + path = M[i][j].path; + if (path == 0) { + switch (m) { + case M_MATRIX: m = Ix_MATRIX; break; + case Ix_MATRIX: m = Iy_MATRIX; break; + case Iy_MATRIX: m = 0; break; + } + break; + } + switch (path) { + case HORIZONTAL: trace = gaps[i][++j].Iy; break; + case VERTICAL: trace = gaps[++i][j].Ix; break; + case DIAGONAL: trace = M[++i][++j].trace; break; + } + switch (m) { + case M_MATRIX: + if (trace & Ix_MATRIX) { + m = Ix_MATRIX; + break; + } + case Ix_MATRIX: + if (trace & Iy_MATRIX) { + m = Iy_MATRIX; + break; + } + case Iy_MATRIX: + default: + switch (path) { + case HORIZONTAL: m = Iy_MATRIX; break; + case VERTICAL: m = Ix_MATRIX; break; + case DIAGONAL: m = M_MATRIX; break; + } + continue; + } + switch (path) { + case HORIZONTAL: j--; break; + case VERTICAL: i--; break; + case DIAGONAL: i--; j--; break; + } + M[i][j].path = path; + break; + } + } + + if (path == 0) { + /* Generate a new path. 
*/ + switch (m) { + case M_MATRIX: + if (M[nA][nB].trace) { + /* m = M_MATRIX; */ + break; + } + case Ix_MATRIX: + if (gaps[nA][nB].Ix) { + m = Ix_MATRIX; + break; + } + case Iy_MATRIX: + if (gaps[nA][nB].Iy) { + m = Iy_MATRIX; + break; + } + default: + /* exhausted this generator */ + M[0][0].path = DONE; + return NULL; + } + } + + switch (m) { + case M_MATRIX: + trace = M[i][j].trace; + path = DIAGONAL; + i--; j--; + break; + case Ix_MATRIX: + trace = gaps[i][j].Ix; + path = VERTICAL; + i--; + break; + case Iy_MATRIX: + trace = gaps[i][j].Iy; + path = HORIZONTAL; + j--; + break; + } + + while (1) { + if (trace & M_MATRIX) { + trace = M[i][j].trace; + M[i][j].path = path; + path = DIAGONAL; + i--; j--; + } + else if (trace & Ix_MATRIX) { + M[i][j].path = path; + trace = gaps[i][j].Ix; + path = VERTICAL; + i--; + } + else if (trace & Iy_MATRIX) { + M[i][j].path = path; + trace = gaps[i][j].Iy; + path = HORIZONTAL; + j--; + } + else break; + } + return PathGenerator_create_path(self, 0, 0); +} + +static PyObject* PathGenerator_next_gotoh_local(PathGenerator* self) +{ + int trace = 0; + int i; + int j; + int m = M_MATRIX; + int iA = self->iA; + int iB = self->iB; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsGotoh** gaps = self->gaps.gotoh; + int path = M[0][0].path; + + if (path == DONE) return NULL; + + path = M[iA][iB].path; + + if (path) { + i = iA; + j = iB; + while (1) { + /* We already have a path. Prune the path to see if there are + * any alternative paths. */ + path = M[i][j].path; + if (path == 0) { + m = M_MATRIX; + iA = i; + iB = j; + break; + } + switch (path) { + case HORIZONTAL: trace = gaps[i][++j].Iy; break; + case VERTICAL: trace = gaps[++i][j].Ix; break; + case DIAGONAL: trace = M[++i][++j].trace; break; + } + switch (m) { + case M_MATRIX: + if (trace & Ix_MATRIX) { + m = Ix_MATRIX; + break; + } + case Ix_MATRIX: + if (trace & Iy_MATRIX) { + m = Iy_MATRIX; + break; + } + case Iy_MATRIX: + default: + switch (path) { + case HORIZONTAL: m = Iy_MATRIX; break; + case VERTICAL: m = Ix_MATRIX; break; + case DIAGONAL: m = M_MATRIX; break; + } + continue; + } + switch (path) { + case HORIZONTAL: j--; break; + case VERTICAL: i--; break; + case DIAGONAL: i--; j--; break; + } + M[i][j].path = path; + break; + } + } + + if (path == 0) { + /* Find the end point for a new path. 
*/ + while (1) { + if (iB < nB) iB++; + else if (iA < nA) { + iA++; + iB = 0; + } + else { + /* we reached the end of the alignment without finding + * an alternative path */ + M[0][0].path = DONE; + return NULL; + } + if (M[iA][iB].trace & ENDPOINT) { + M[iA][iB].path = 0; + break; + } + } + m = M_MATRIX; + i = iA; + j = iB; + } + + while (1) { + switch (m) { + case M_MATRIX: trace = M[i][j].trace; break; + case Ix_MATRIX: trace = gaps[i][j].Ix; break; + case Iy_MATRIX: trace = gaps[i][j].Iy; break; + } + if (trace == STARTPOINT) { + self->iA = i; + self->iB = j; + return PathGenerator_create_path(self, i, j); + } + switch (m) { + case M_MATRIX: + path = DIAGONAL; + i--; + j--; + break; + case Ix_MATRIX: + path = VERTICAL; + i--; + break; + case Iy_MATRIX: + path = HORIZONTAL; + j--; + break; + } + if (trace & M_MATRIX) m = M_MATRIX; + else if (trace & Ix_MATRIX) m = Ix_MATRIX; + else if (trace & Iy_MATRIX) m = Iy_MATRIX; + else { + PyErr_SetString(PyExc_RuntimeError, + "Unexpected trace in PathGenerator_next_gotoh_local"); + return NULL; + } + M[i][j].path = path; + } + return NULL; +} + +static PyObject* +PathGenerator_next_waterman_smith_beyer_global(PathGenerator* self) +{ + int i = 0, j = 0; + int iA, iB; + int trace; + int* gapM; + int* gapXY; + + int m = M_MATRIX; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer; + + int gap; + int path = M[0][0].path; + + if (path == DONE) return NULL; + + if (path) { + /* We already have a path. Prune the path to see if there are + * any alternative paths. */ + while (1) { + if (!path) { + m <<= 1; + break; + } + switch (path) { + case HORIZONTAL: + iA = i; + iB = j; + while (M[i][iB].path == HORIZONTAL) iB++; + break; + case VERTICAL: + iA = i; + while (M[iA][j].path == VERTICAL) iA++; + iB = j; + break; + case DIAGONAL: + iA = i + 1; + iB = j + 1; + break; + default: + PyErr_SetString(PyExc_RuntimeError, + "Unexpected path in PathGenerator_next_waterman_smith_beyer_global"); + return NULL; + } + if (i == iA) { /* HORIZONTAL */ + gapM = gaps[iA][iB].MIy; + gapXY = gaps[iA][iB].IxIy; + if (m == M_MATRIX) { + gap = iB - j; + while (*gapM != gap) gapM++; + gapM++; + gap = *gapM; + if (gap) { + j = iB - gap; + while (j < iB) M[i][--iB].path = HORIZONTAL; + break; + } + } else if (m == Ix_MATRIX) { + gap = iB - j; + while (*gapXY != gap) gapXY++; + gapXY++; + } + gap = *gapXY; + if (gap) { + m = Ix_MATRIX; + j = iB - gap; + while (j < iB) M[i][--iB].path = HORIZONTAL; + break; + } + /* no alternative found; continue pruning */ + m = Iy_MATRIX; + j = iB; + } + else if (j == iB) { /* VERTICAL */ + gapM = gaps[iA][iB].MIx; + gapXY = gaps[iA][iB].IyIx; + if (m == M_MATRIX) { + gap = iA - i; + while (*gapM != gap) gapM++; + gapM++; + gap = *gapM; + if (gap) { + i = iA - gap; + while (i < iA) M[--iA][j].path = VERTICAL; + break; + } + } else if (m == Iy_MATRIX) { + gap = iA - i; + while (*gapXY != gap) gapXY++; + gapXY++; + } + gap = *gapXY; + if (gap) { + m = Iy_MATRIX; + i = iA - gap; + while (i < iA) M[--iA][j].path = VERTICAL; + break; + } + /* no alternative found; continue pruning */ + m = Ix_MATRIX; + i = iA; + } + else { /* DIAGONAL */ + i = iA - 1; + j = iB - 1; + trace = M[iA][iB].trace; + switch (m) { + case M_MATRIX: + if (trace & Ix_MATRIX) { + m = Ix_MATRIX; + M[i][j].path = DIAGONAL; + break; + } + case Ix_MATRIX: + if (trace & Iy_MATRIX) { + m = Iy_MATRIX; + M[i][j].path = DIAGONAL; + break; + } + case Iy_MATRIX: + default: + /* no alternative 
found; continue pruning */ + m = M_MATRIX; + i = iA; + j = iB; + path = M[i][j].path; + continue; + } + /* alternative found; build path until starting point */ + break; + } + path = M[i][j].path; + } + } + + if (!path) { + /* Find a suitable end point for a path. */ + switch (m) { + case M_MATRIX: + if (M[nA][nB].trace) { + /* m = M_MATRIX; */ + break; + } + case Ix_MATRIX: + if (gaps[nA][nB].MIx[0] || gaps[nA][nB].IyIx[0]) { + m = Ix_MATRIX; + break; + } + case Iy_MATRIX: + if (gaps[nA][nB].MIy[0] || gaps[nA][nB].IxIy[0]) { + m = Iy_MATRIX; + break; + } + default: + M[0][0].path = DONE; + return NULL; + } + i = nA; + j = nB; + } + + /* Follow the traceback until we reach the origin. */ + while (1) { + switch (m) { + case M_MATRIX: + trace = M[i][j].trace; + if (trace & M_MATRIX) m = M_MATRIX; + else if (trace & Ix_MATRIX) m = Ix_MATRIX; + else if (trace & Iy_MATRIX) m = Iy_MATRIX; + else return PathGenerator_create_path(self, i, j); + i--; + j--; + M[i][j].path = DIAGONAL; + break; + case Ix_MATRIX: + gap = gaps[i][j].MIx[0]; + if (gap) m = M_MATRIX; + else { + gap = gaps[i][j].IyIx[0]; + m = Iy_MATRIX; + } + iA = i - gap; + while (iA < i) M[--i][j].path = VERTICAL; + M[i][j].path = VERTICAL; + break; + case Iy_MATRIX: + gap = gaps[i][j].MIy[0]; + if (gap) m = M_MATRIX; + else { + gap = gaps[i][j].IxIy[0]; + m = Ix_MATRIX; + } + iB = j - gap; + while (iB < j) M[i][--j].path = HORIZONTAL; + M[i][j].path = HORIZONTAL; + break; + } + } +} + +static PyObject* +PathGenerator_next_waterman_smith_beyer_local(PathGenerator* self) +{ + int i, j, m; + int trace = 0; + int* gapM; + int* gapXY; + + int iA = self->iA; + int iB = self->iB; + const int nA = self->nA; + const int nB = self->nB; + Trace** M = self->M; + TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer; + + int gap; + int path = M[0][0].path; + + if (path == DONE) return NULL; + m = 0; + path = M[iA][iB].path; + if (path) { + /* We already have a path. Prune the path to see if there are + * any alternative paths. 
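+ * Pruning walks forward along the stored path; at each segment (a
+ * diagonal step or a run of gap steps) it consults the trace and gap
+ * arrays for the next untried alternative. If one is found, the tail
+ * of the path is rewritten and the traceback below completes it; if
+ * none remains anywhere on the path, m is set to 0 and a new end
+ * point is sought instead.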
*/ + m = M_MATRIX; + i = iA; + j = iB; + while (1) { + path = M[i][j].path; + switch (path) { + case HORIZONTAL: + iA = i; + iB = j; + while (M[i][iB].path == HORIZONTAL) iB++; + break; + case VERTICAL: + iA = i; + iB = j; + while (M[iA][j].path == VERTICAL) iA++; + break; + case DIAGONAL: + iA = i + 1; + iB = j + 1; + break; + default: + iA = -1; + break; + } + if (iA < 0) { + m = 0; + iA = i; + iB = j; + break; + } + if (i == iA) { /* HORIZONTAL */ + gapM = gaps[iA][iB].MIy; + gapXY = gaps[iA][iB].IxIy; + if (m == M_MATRIX) { + gap = iB - j; + while (*gapM != gap) gapM++; + gapM++; + gap = *gapM; + if (gap) { + j = iB - gap; + while (j < iB) M[i][--iB].path = HORIZONTAL; + break; + } + } else if (m == Ix_MATRIX) { + gap = iB - j; + while (*gapXY != gap) gapXY++; + gapXY++; + } + gap = *gapXY; + if (gap) { + m = Ix_MATRIX; + j = iB - gap; + M[i][j].path = HORIZONTAL; + while (iB > j) M[i][--iB].path = HORIZONTAL; + break; + } + /* no alternative found; continue pruning */ + m = Iy_MATRIX; + j = iB; + } + else if (j == iB) { /* VERTICAL */ + gapM = gaps[iA][iB].MIx; + gapXY = gaps[iA][iB].IyIx; + if (m == M_MATRIX) { + gap = iA - i; + while (*gapM != gap) gapM++; + gapM++; + gap = *gapM; + if (gap) { + i = iA - gap; + while (i < iA) M[--iA][j].path = VERTICAL; + break; + } + } else if (m == Iy_MATRIX) { + gap = iA - i; + while (*gapXY != gap) gapXY++; + gapXY++; + } + gap = *gapXY; + if (gap) { + m = Iy_MATRIX; + i = iA - gap; + M[i][j].path = VERTICAL; + while (iA > i) M[--iA][j].path = VERTICAL; + break; + } + /* no alternative found; continue pruning */ + m = Ix_MATRIX; + i = iA; + } + else { /* DIAGONAL */ + i = iA - 1; + j = iB - 1; + trace = M[iA][iB].trace; + switch (m) { + case M_MATRIX: + if (trace & Ix_MATRIX) { + m = Ix_MATRIX; + M[i][j].path = DIAGONAL; + break; + } + case Ix_MATRIX: + if (trace & Iy_MATRIX) { + m = Iy_MATRIX; + M[i][j].path = DIAGONAL; + break; + } + case Iy_MATRIX: + default: + /* no alternative found; continue pruning */ + m = M_MATRIX; + i = iA; + j = iB; + continue; + } + /* alternative found; build path until starting point */ + break; + } + } + } + + if (m == 0) { + /* We are at [nA][nB]. Find a suitable end point for a path. */ + while (1) { + if (iB < nB) iB++; + else if (iA < nA) { + iA++; + iB = 0; + } + else { + /* exhausted this generator */ + M[0][0].path = DONE; + return NULL; + } + if (M[iA][iB].trace & ENDPOINT) break; + } + M[iA][iB].path = 0; + m = M_MATRIX; + i = iA; + j = iB; + } + + /* Follow the traceback until we reach the origin. 
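+ * Each gap array (MIx, IyIx, MIy, IxIy) is a zero-terminated list of
+ * candidate gap lengths for this cell; the traceback below always
+ * takes entry 0, and the pruning pass above steps through the
+ * remaining entries on subsequent calls.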
*/ + while (1) { + switch (m) { + case Ix_MATRIX: + gapM = gaps[i][j].MIx; + gapXY = gaps[i][j].IyIx; + iB = j; + gap = *gapM; + if (gap) m = M_MATRIX; + else { + gap = *gapXY; + m = Iy_MATRIX; + } + iA = i - gap; + while (i > iA) M[--i][iB].path = VERTICAL; + break; + case Iy_MATRIX: + gapM = gaps[i][j].MIy; + gapXY = gaps[i][j].IxIy; + iA = i; + gap = *gapM; + if (gap) m = M_MATRIX; + else { + gap = *gapXY; + m = Ix_MATRIX; + } + iB = j - gap; + while (j > iB) M[iA][--j].path = HORIZONTAL; + break; + case M_MATRIX: + iA = i-1; + iB = j-1; + trace = M[i][j].trace; + if (trace & M_MATRIX) m = M_MATRIX; + else if (trace & Ix_MATRIX) m = Ix_MATRIX; + else if (trace & Iy_MATRIX) m = Iy_MATRIX; + else if (trace == STARTPOINT) { + self->iA = i; + self->iB = j; + return PathGenerator_create_path(self, i, j); + } + else { + PyErr_SetString(PyExc_RuntimeError, + "Unexpected trace in PathGenerator_next_waterman_smith_beyer_local"); + return NULL; + } + M[iA][iB].path = DIAGONAL; + break; + } + i = iA; + j = iB; + } +} + +static PyObject * +PathGenerator_next(PathGenerator* self) +{ + const Mode mode = self->mode; + const Algorithm algorithm = self->algorithm; + switch (algorithm) { + case NeedlemanWunschSmithWaterman: + switch (mode) { + case Global: + return PathGenerator_next_needlemanwunsch(self); + case Local: + return PathGenerator_next_smithwaterman(self); + } + case Gotoh: + switch (mode) { + case Global: + return PathGenerator_next_gotoh_global(self); + case Local: + return PathGenerator_next_gotoh_local(self); + } + case WatermanSmithBeyer: + switch (mode) { + case Global: + return PathGenerator_next_waterman_smith_beyer_global(self); + case Local: + return PathGenerator_next_waterman_smith_beyer_local(self); + } + case Unknown: + default: + PyErr_SetString(PyExc_RuntimeError, "Unknown algorithm"); + return NULL; + } +} + +static const char PathGenerator_reset__doc__[] = "reset the iterator"; + +static PyObject* +PathGenerator_reset(PathGenerator* self) +{ + switch (self->mode) { + case Local: + self->iA = 0; + self->iB = 0; + case Global: { + Trace** M = self->M; + switch (self->algorithm) { + case NeedlemanWunschSmithWaterman: + case Gotoh: { + if (M[0][0].path != NONE) M[0][0].path = 0; + break; + } + case WatermanSmithBeyer: { + M[0][0].path = 0; + break; + } + case Unknown: + default: + break; + } + } + } + Py_INCREF(Py_None); + return Py_None; +} + +static PyMethodDef PathGenerator_methods[] = { + {"reset", + (PyCFunction)PathGenerator_reset, + METH_NOARGS, + PathGenerator_reset__doc__ + }, + {NULL} /* Sentinel */ +}; + +static PySequenceMethods PathGenerator_as_sequence = { + (lenfunc)PathGenerator_length, /* sq_length */ + NULL, /* sq_concat */ + NULL, /* sq_repeat */ + NULL, /* sq_item */ + NULL, /* sq_ass_item */ + NULL, /* sq_contains */ + NULL, /* sq_inplace_concat */ + NULL, /* sq_inplace_repeat */ +}; + +static PyTypeObject PathGenerator_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + "Path generator", /* tp_name */ + sizeof(PathGenerator), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)PathGenerator_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &PathGenerator_as_sequence, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare 
*/ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)PathGenerator_next, /* tp_iternext */ + PathGenerator_methods, /* tp_methods */ +}; + +typedef struct { + PyObject_HEAD + Mode mode; + Algorithm algorithm; + double match; + double mismatch; + double epsilon; + double target_internal_open_gap_score; + double target_internal_extend_gap_score; + double target_left_open_gap_score; + double target_left_extend_gap_score; + double target_right_open_gap_score; + double target_right_extend_gap_score; + double query_internal_open_gap_score; + double query_internal_extend_gap_score; + double query_left_open_gap_score; + double query_left_extend_gap_score; + double query_right_open_gap_score; + double query_right_extend_gap_score; + PyObject* target_gap_function; + PyObject* query_gap_function; + Py_buffer substitution_matrix; + PyObject* alphabet; + int* mapping; + int wildcard; +} Aligner; + + +static Py_ssize_t +set_alphabet(Aligner* self, PyObject* alphabet) +{ + Py_ssize_t size; + if (alphabet == Py_None) { + if (self->alphabet) { + Py_DECREF(self->alphabet); + self->alphabet = NULL; + } + if (self->mapping) { + PyMem_Free(self->mapping); + self->mapping = NULL; + } + return 0; + } + else if (PyUnicode_Check(alphabet)) { + int* mapping; + int i; + int n; + int kind; + void* characters; + if (PyUnicode_READY(alphabet) == -1) return -1; + size = PyUnicode_GET_LENGTH(alphabet); + if (size == 0) { + PyErr_SetString(PyExc_ValueError, "alphabet has zero length"); + return -1; + } + kind = PyUnicode_KIND(alphabet); + switch (kind) { + case PyUnicode_1BYTE_KIND: { + n = 1 << 8 * sizeof(Py_UCS1); + break; + } + case PyUnicode_2BYTE_KIND: { + n = 1 << 8 * sizeof(Py_UCS2); + break; + } + case PyUnicode_4BYTE_KIND: { + n = 0x110000; /* Maximum code point in Unicode 6.0 + * is 0x10ffff = 1114111 */ + break; + } + case PyUnicode_WCHAR_KIND: + default: + PyErr_SetString(PyExc_ValueError, "could not interpret alphabet"); + return -1; + } + characters = PyUnicode_DATA(alphabet); + mapping = PyMem_Malloc(n*sizeof(int)); + if (!mapping) return -1; + for (i = 0; i < n; i++) mapping[i] = MISSING_LETTER; + for (i = 0; i < size; i++) { + Py_UCS4 character = PyUnicode_READ(kind, characters, i); + if (mapping[character] != MISSING_LETTER) { + PyObject* c = PyUnicode_FromKindAndData(kind, &character, 1); + PyErr_Format(PyExc_ValueError, + "alphabet contains '%S' more than once", c); + Py_XDECREF(c); + PyMem_Free(mapping); + return -1; + } + mapping[character] = i; + } + Py_INCREF(alphabet); + if (self->mapping) PyMem_Free(self->mapping); + self->mapping = mapping; + } + else { + /* alphabet is not a string; cannot use mapping */ + PyObject* sequence = PySequence_Fast(alphabet, + "alphabet should support the sequence protocol (e.g.,\n" + "strings, lists, and tuples can be valid alphabets)."); + if (!sequence) return -1; + size = PySequence_Fast_GET_SIZE(sequence); + Py_DECREF(sequence); + if (self->mapping) { + PyMem_Free(self->mapping); + self->mapping = NULL; + } + Py_INCREF(alphabet); + } + Py_XDECREF(self->alphabet); + self->alphabet = alphabet; + return size; +} + +static int +Aligner_init(Aligner *self, PyObject *args, PyObject *kwds) +{ + self->mode = Global; + self->match = 1.0; + self->mismatch = 0.0; + self->epsilon = 1.e-6; + self->target_internal_open_gap_score = 0; + self->target_internal_extend_gap_score = 0; + self->query_internal_open_gap_score = 0; + self->query_internal_extend_gap_score = 0; + self->target_left_open_gap_score = 0; + self->target_left_extend_gap_score = 0; 
+    self->target_right_open_gap_score = 0;
+    self->target_right_extend_gap_score = 0;
+    self->query_left_open_gap_score = 0;
+    self->query_left_extend_gap_score = 0;
+    self->query_right_open_gap_score = 0;
+    self->query_right_extend_gap_score = 0;
+    self->target_gap_function = NULL;
+    self->query_gap_function = NULL;
+    self->substitution_matrix.obj = NULL;
+    self->substitution_matrix.buf = NULL;
+    self->algorithm = Unknown;
+    self->alphabet = NULL;
+    self->mapping = NULL;
+    self->wildcard = -1;
+    return 0;
+}
+
+static void
+Aligner_dealloc(Aligner* self)
+{   Py_XDECREF(self->target_gap_function);
+    Py_XDECREF(self->query_gap_function);
+    if (self->substitution_matrix.obj) PyBuffer_Release(&self->substitution_matrix);
+    Py_XDECREF(self->alphabet);
+    /* mapping is a PyMem_Malloc'ed int array, not a Python object,
+     * so it must be freed rather than DECREF'ed. */
+    if (self->mapping) PyMem_Free(self->mapping);
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject*
+Aligner_repr(Aligner* self)
+{
+    const char text[] = "Pairwise aligner, implementing the Needleman-Wunsch, Smith-Waterman, Gotoh, and Waterman-Smith-Beyer global and local alignment algorithms";
+    return PyUnicode_FromString(text);
+}
+
+static PyObject*
+Aligner_str(Aligner* self)
+{
+    char text[1024];
+    char* p = text;
+    PyObject* substitution_matrix = self->substitution_matrix.obj;
+    void* args[3] = {NULL, NULL, NULL};
+    int n = 0;
+    PyObject* wildcard = NULL;
+    PyObject* s;
+
+    p += sprintf(p, "Pairwise sequence aligner with parameters\n");
+    if (substitution_matrix) {
+        p += sprintf(p, "  substitution_matrix: <%s object at %p>\n",
+                     Py_TYPE(substitution_matrix)->tp_name,
+                     substitution_matrix);
+    } else {
+        if (self->wildcard == -1) {
+            p += sprintf(p, "  wildcard: None\n");
+        }
+        else {
+            wildcard = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
+                                                 &self->wildcard, 1);
+            if (!wildcard) return NULL;
+            p += sprintf(p, "  wildcard: '%%U'\n");
+            args[n++] = wildcard;
+        }
+        p += sprintf(p, "  match_score: %f\n", self->match);
+        p += sprintf(p, "  mismatch_score: %f\n", self->mismatch);
+    }
+    if (self->target_gap_function) {
+        p += sprintf(p, "  target_gap_function: %%R\n");
+        args[n++] = self->target_gap_function;
+    }
+    else {
+        p += sprintf(p, "  target_internal_open_gap_score: %f\n",
+                     self->target_internal_open_gap_score);
+        p += sprintf(p, "  target_internal_extend_gap_score: %f\n",
+                     self->target_internal_extend_gap_score);
+        p += sprintf(p, "  target_left_open_gap_score: %f\n",
+                     self->target_left_open_gap_score);
+        p += sprintf(p, "  target_left_extend_gap_score: %f\n",
+                     self->target_left_extend_gap_score);
+        p += sprintf(p, "  target_right_open_gap_score: %f\n",
+                     self->target_right_open_gap_score);
+        p += sprintf(p, "  target_right_extend_gap_score: %f\n",
+                     self->target_right_extend_gap_score);
+    }
+    if (self->query_gap_function) {
+        p += sprintf(p, "  query_gap_function: %%R\n");
+        args[n++] = self->query_gap_function;
+    }
+    else {
+        p += sprintf(p, "  query_internal_open_gap_score: %f\n",
+                     self->query_internal_open_gap_score);
+        p += sprintf(p, "  query_internal_extend_gap_score: %f\n",
+                     self->query_internal_extend_gap_score);
+        p += sprintf(p, "  query_left_open_gap_score: %f\n",
+                     self->query_left_open_gap_score);
+        p += sprintf(p, "  query_left_extend_gap_score: %f\n",
+                     self->query_left_extend_gap_score);
+        p += sprintf(p, "  query_right_open_gap_score: %f\n",
+                     self->query_right_open_gap_score);
+        p += sprintf(p, "  query_right_extend_gap_score: %f\n",
+                     self->query_right_extend_gap_score);
+    }
+    switch (self->mode) {
+        case Global: sprintf(p, "  mode: global\n"); break;
+        case Local: sprintf(p, "  mode: local\n"); break;
+    }
+    s = PyUnicode_FromFormat(text, args[0], args[1],
                             args[2]);
+    Py_XDECREF(wildcard);
+    return s;
+}
+
+static char Aligner_mode__doc__[] = "alignment mode ('global' or 'local')";
+
+static PyObject*
+Aligner_get_mode(Aligner* self, void* closure)
+{   const char* message = NULL;
+    switch (self->mode) {
+        case Global: message = "global"; break;
+        case Local: message = "local"; break;
+    }
+    return PyUnicode_FromString(message);
+}
+
+static int
+Aligner_set_mode(Aligner* self, PyObject* value, void* closure)
+{
+    if (PyUnicode_Check(value)) {
+        if (PyUnicode_CompareWithASCIIString(value, "global") == 0) {
+            self->mode = Global;
+            return 0;
+        }
+        if (PyUnicode_CompareWithASCIIString(value, "local") == 0) {
+            self->mode = Local;
+            return 0;
+        }
+    }
+    PyErr_SetString(PyExc_ValueError,
+                    "invalid mode (expected 'global' or 'local')");
+    return -1;
+}
+
+static char Aligner_match_score__doc__[] = "match score";
+
+static PyObject*
+Aligner_get_match_score(Aligner* self, void* closure)
+{   if (self->substitution_matrix.obj) {
+        Py_INCREF(Py_None);
+        return Py_None;
+    }
+    return PyFloat_FromDouble(self->match);
+}
+
+static int
+Aligner_set_match_score(Aligner* self, PyObject* value, void* closure)
+{
+    const double match = PyFloat_AsDouble(value);
+    if (PyErr_Occurred()) {
+        PyErr_SetString(PyExc_ValueError, "invalid match score");
+        return -1;
+    }
+    if (self->substitution_matrix.obj) {
+        if (set_alphabet(self, Py_None) < 0) return -1;
+        PyBuffer_Release(&self->substitution_matrix);
+    }
+    self->match = match;
+    return 0;
+}
+
+static char Aligner_mismatch_score__doc__[] = "mismatch score";
+
+static PyObject*
+Aligner_get_mismatch_score(Aligner* self, void* closure)
+{   if (self->substitution_matrix.obj) {
+        Py_INCREF(Py_None);
+        return Py_None;
+    }
+    return PyFloat_FromDouble(self->mismatch);
+}
+
+static int
+Aligner_set_mismatch_score(Aligner* self, PyObject* value, void* closure)
+{
+    const double mismatch = PyFloat_AsDouble(value);
+    if (PyErr_Occurred()) {
+        PyErr_SetString(PyExc_ValueError, "invalid mismatch score");
+        return -1;
+    }
+    if (self->substitution_matrix.obj) {
+        if (set_alphabet(self, Py_None) < 0) return -1;
+        PyBuffer_Release(&self->substitution_matrix);
+    }
+    self->mismatch = mismatch;
+    return 0;
+}
+
+static char Aligner_substitution_matrix__doc__[] = "substitution_matrix";
+
+static PyObject*
+Aligner_get_substitution_matrix(Aligner* self, void* closure)
+{   PyObject* object = self->substitution_matrix.obj;
+    if (!object) object = Py_None;
+    Py_INCREF(object);
+    return object;
+}
+
+static int
+Aligner_set_substitution_matrix(Aligner* self, PyObject* values, void* closure)
+{
+    PyObject* alphabet;
+    Py_ssize_t size = -1;
+    Py_buffer view;
+    const int flag = PyBUF_FORMAT | PyBUF_ND;
+    if (values == Py_None) {
+        if (self->substitution_matrix.obj)
+            PyBuffer_Release(&self->substitution_matrix);
+        return 0;
+    }
+    if (PyObject_GetBuffer(values, &view, flag) != 0) {
+        PyErr_SetString(PyExc_ValueError, "expected a matrix");
+        return -1;
+    }
+    if (view.ndim != 2) {
+        PyErr_Format(PyExc_ValueError,
+                     "substitution matrix has incorrect rank (%d, expected 2)",
+                     view.ndim);
+        PyBuffer_Release(&view);
+        return -1;
+    }
+    if (view.len == 0) {
+        PyErr_SetString(PyExc_ValueError, "substitution matrix has zero size");
+        PyBuffer_Release(&view);
+        return -1;
+    }
+    if (strcmp(view.format, "d") != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "substitution matrix should contain float values");
+        PyBuffer_Release(&view);
+        return -1;
+    }
+    if (view.itemsize != sizeof(double)) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "substitution matrix has unexpected 
item byte size " + "(%zd, expected %zd)", view.itemsize, sizeof(double)); + PyBuffer_Release(&view); + return -1; + } + if (view.shape[0] != view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "substitution matrix should be square " + "(found a %zd x %zd matrix)", + view.shape[0], view.shape[1]); + PyBuffer_Release(&view); + return -1; + } + alphabet = PyObject_GetAttrString(values, "alphabet"); + if (alphabet) { + size = set_alphabet(self, alphabet); + Py_DECREF(alphabet); + } else { + /* Set a substitution matrix without setting an alphabet; useful + * when aligning integers. */ + PyErr_Clear(); + size = set_alphabet(self, Py_None); + } + if (size < 0) { + PyBuffer_Release(&view); + return -1; + } + if (self->substitution_matrix.obj) PyBuffer_Release(&self->substitution_matrix); + self->substitution_matrix = view; + return 0; +} + +static char Aligner_alphabet__doc__[] = "alphabet"; + +static PyObject* +Aligner_get_alphabet(Aligner* self, void* closure) +{ PyObject* object = self->alphabet; + if (!object) object = Py_None; + Py_INCREF(object); + return object; +} + +static int +Aligner_set_alphabet(Aligner* self, PyObject* alphabet, void* closure) +{ + if (self->substitution_matrix.obj) { + PyErr_SetString(PyExc_AttributeError, + "can't set alphabet if a substitution matrix is used"); + return -1; + } + if (set_alphabet(self, alphabet) < 0) return -1; + return 0; +} + +static char Aligner_gap_score__doc__[] = "gap score"; + +static PyObject* +Aligner_get_gap_score(Aligner* self, void* closure) +{ + if (self->target_gap_function || self->query_gap_function) { + if (self->target_gap_function != self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + Py_INCREF(self->target_gap_function); + return self->target_gap_function; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->target_internal_extend_gap_score + || score != self->target_left_open_gap_score + || score != self->target_left_extend_gap_score + || score != self->target_right_open_gap_score + || score != self->target_right_extend_gap_score + || score != self->query_internal_open_gap_score + || score != self->query_internal_extend_gap_score + || score != self->query_left_open_gap_score + || score != self->query_left_extend_gap_score + || score != self->query_right_open_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_gap_score(Aligner* self, PyObject* value, void* closure) +{ if (PyCallable_Check(value)) { + Py_XDECREF(self->target_gap_function); + Py_XDECREF(self->query_gap_function); + Py_INCREF(value); + Py_INCREF(value); + self->target_gap_function = value; + self->query_gap_function = value; + } + else { + const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_open_gap_score = score; + self->target_internal_extend_gap_score = score; + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + self->query_internal_open_gap_score = score; + 
self->query_internal_extend_gap_score = score; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_open_gap_score__doc__[] = "internal and end open gap score"; + +static PyObject* +Aligner_get_open_gap_score(Aligner* self, void* closure) +{ + if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->target_left_open_gap_score + || score != self->target_right_open_gap_score + || score != self->query_internal_open_gap_score + || score != self->query_left_open_gap_score + || score != self->query_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_open_gap_score = score; + self->target_left_open_gap_score = score; + self->target_right_open_gap_score = score; + self->query_internal_open_gap_score = score; + self->query_left_open_gap_score = score; + self->query_right_open_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_extend_gap_score__doc__[] = "extend gap score"; + +static PyObject* +Aligner_get_extend_gap_score(Aligner* self, void* closure) +{ + if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_extend_gap_score; + if (score != self->target_left_extend_gap_score + || score != self->target_right_extend_gap_score + || score != self->query_internal_extend_gap_score + || score != self->query_left_extend_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_extend_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_extend_gap_score = score; + self->query_internal_extend_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_internal_gap_score__doc__[] = "internal gap score"; + +static PyObject* +Aligner_get_internal_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double 
score = self->target_internal_open_gap_score; + if (score != self->target_internal_extend_gap_score + || score != self->query_internal_open_gap_score + || score != self->query_internal_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_internal_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_open_gap_score = score; + self->target_internal_extend_gap_score = score; + self->query_internal_open_gap_score = score; + self->query_internal_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_internal_open_gap_score__doc__[] = "internal open gap score"; + +static PyObject* +Aligner_get_internal_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->query_internal_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_internal_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_open_gap_score = score; + self->query_internal_open_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_internal_extend_gap_score__doc__[] = "internal extend gap score"; + +static PyObject* +Aligner_get_internal_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_extend_gap_score; + if (score != self->query_internal_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_internal_extend_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_internal_extend_gap_score = score; + self->query_internal_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_end_gap_score__doc__[] = "end gap score"; + +static PyObject* +Aligner_get_end_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double 
score = self->target_left_open_gap_score; + if (score != self->target_left_extend_gap_score + || score != self->target_right_open_gap_score + || score != self->target_right_extend_gap_score + || score != self->query_left_open_gap_score + || score != self->query_left_extend_gap_score + || score != self->query_right_open_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_end_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_end_open_gap_score__doc__[] = "end open gap score"; + +static PyObject* +Aligner_get_end_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->target_right_open_gap_score + || score != self->query_left_open_gap_score + || score != self->query_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_end_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_open_gap_score = score; + self->target_right_open_gap_score = score; + self->query_left_open_gap_score = score; + self->query_right_open_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_end_extend_gap_score__doc__[] = "end extend gap score"; + +static PyObject* +Aligner_get_end_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_extend_gap_score; + if (score != self->target_right_extend_gap_score + || score != self->query_left_extend_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_end_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if 
(self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_extend_gap_score = score; + self->target_right_extend_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_left_gap_score__doc__[] = "left gap score"; + +static PyObject* +Aligner_get_left_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->target_left_extend_gap_score + || score != self->query_left_open_gap_score + || score != self->query_left_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_left_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_right_gap_score__doc__[] = "right gap score"; + +static PyObject* +Aligner_get_right_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_right_open_gap_score; + if (score != self->target_right_extend_gap_score + || score != self->query_right_open_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_right_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_left_open_gap_score__doc__[] = "left open gap score"; + +static PyObject* +Aligner_get_left_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->query_left_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_left_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = 
PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_open_gap_score = score; + self->query_left_open_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_left_extend_gap_score__doc__[] = "left extend gap score"; + +static PyObject* +Aligner_get_left_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_extend_gap_score; + if (score != self->query_left_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_left_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_left_extend_gap_score = score; + self->query_left_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_right_open_gap_score__doc__[] = "right open gap score"; + +static PyObject* +Aligner_get_right_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_right_open_gap_score; + if (score != self->query_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_right_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_right_open_gap_score = score; + self->query_right_open_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_right_extend_gap_score__doc__[] = "right extend gap score"; + +static PyObject* +Aligner_get_right_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function || self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_right_extend_gap_score; + if (score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_right_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + if (self->query_gap_function) { + 
Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->target_right_extend_gap_score = score; + self->query_right_extend_gap_score = score; + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_open_gap_score__doc__[] = "target open gap score"; + +static PyObject* +Aligner_get_target_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->target_left_open_gap_score + || score != self->target_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_internal_open_gap_score = score; + self->target_left_open_gap_score = score; + self->target_right_open_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_extend_gap_score__doc__[] = "target extend gap score"; + +static PyObject* +Aligner_get_target_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_extend_gap_score; + if (score != self->target_left_extend_gap_score + || score != self->target_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_internal_extend_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_gap_score__doc__[] = "target gap score"; + +static PyObject* +Aligner_get_target_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + Py_INCREF(self->target_gap_function); + return self->target_gap_function; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->target_internal_extend_gap_score + || score != self->target_left_open_gap_score + || score != self->target_left_extend_gap_score + || score != self->target_right_open_gap_score + || score != self->target_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_gap_score(Aligner* self, PyObject* value, void* closure) +{ + if (PyCallable_Check(value)) { + Py_XDECREF(self->target_gap_function); + Py_INCREF(value); + self->target_gap_function = value; + } + else { + const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "gap score should be numerical or callable"); + return -1; + } + self->target_internal_open_gap_score = score; + 
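+    /* Like every score setter in this file, this assignment ends by
+     * resetting self->algorithm to Unknown (see below), so that the
+     * appropriate algorithm (Needleman-Wunsch/Smith-Waterman, Gotoh,
+     * or Waterman-Smith-Beyer) is selected afresh the next time an
+     * alignment is computed. */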
self->target_internal_extend_gap_score = score; + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_open_gap_score__doc__[] = "query gap open score"; + +static PyObject* +Aligner_get_query_open_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_internal_open_gap_score; + if (score != self->query_left_open_gap_score + || score != self->query_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_internal_open_gap_score = score; + self->query_left_open_gap_score = score; + self->query_right_open_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_extend_gap_score__doc__[] = "query gap extend score"; + +static PyObject* +Aligner_get_query_extend_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_internal_extend_gap_score; + if (score != self->query_left_extend_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_internal_extend_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_gap_score__doc__[] = "query gap score"; + +static PyObject* +Aligner_get_query_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + Py_INCREF(self->query_gap_function); + return self->query_gap_function; + } + else { + const double score = self->query_internal_open_gap_score; + if (score != self->query_left_open_gap_score + || score != self->query_right_open_gap_score + || score != self->query_internal_extend_gap_score + || score != self->query_left_extend_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_gap_score(Aligner* self, PyObject* value, void* closure) +{ if (PyCallable_Check(value)) { + Py_XDECREF(self->query_gap_function); + Py_INCREF(value); + self->query_gap_function = value; + } + else { + const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + 
"gap score should be numerical or callable"); + return -1; + } + self->query_internal_open_gap_score = score; + self->query_internal_extend_gap_score = score; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_internal_open_gap_score__doc__[] = "target internal open gap score"; + +static PyObject* +Aligner_get_target_internal_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_internal_open_gap_score); +} + +static int +Aligner_set_target_internal_open_gap_score(Aligner* self, + PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_internal_open_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_internal_extend_gap_score__doc__[] = "target internal extend gap score"; + +static PyObject* +Aligner_get_target_internal_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_internal_extend_gap_score); +} + +static int +Aligner_set_target_internal_extend_gap_score(Aligner* self, + PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_internal_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_internal_gap_score__doc__[] = "target internal gap score"; + +static PyObject* +Aligner_get_target_internal_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_internal_open_gap_score; + if (score != self->target_internal_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_internal_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_internal_open_gap_score = score; + self->target_internal_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_end_gap_score__doc__[] = "target end gap score"; + +static PyObject* +Aligner_get_target_end_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->target_left_extend_gap_score + || score != self->target_right_open_gap_score + || score != 
self->target_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_end_gap_score(Aligner* self, PyObject* value, void* closure) { + const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_end_open_gap_score__doc__[] = "target end open gap score"; + +static PyObject* +Aligner_get_target_end_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->target_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_end_open_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_open_gap_score = score; + self->target_right_open_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_end_extend_gap_score__doc__[] = "target end extend gap score"; + +static PyObject* +Aligner_get_target_end_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_extend_gap_score; + if (score != self->target_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_end_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_extend_gap_score = score; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_left_open_gap_score__doc__[] = "target left open score"; + +static PyObject* +Aligner_get_target_left_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_left_open_gap_score); +} + +static int +Aligner_set_target_left_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_open_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_left_extend_gap_score__doc__[] = "target left extend score"; + +static PyObject* 
+Aligner_get_target_left_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_left_extend_gap_score); +} + +static int +Aligner_set_target_left_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_left_gap_score__doc__[] = "target left score"; + +static PyObject* +Aligner_get_target_left_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_left_open_gap_score; + if (score != self->target_left_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_left_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_left_open_gap_score = score; + self->target_left_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_right_gap_score_open__doc__[] = "target right open score"; + +static PyObject* +Aligner_get_target_right_open_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_right_open_gap_score); +} + +static int +Aligner_set_target_right_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_right_open_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_right_extend_gap_score__doc__[] = "target right extend score"; + +static PyObject* +Aligner_get_target_right_extend_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->target_right_extend_gap_score); +} + +static int +Aligner_set_target_right_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_target_right_gap_score__doc__[] = "target right score"; + +static PyObject* +Aligner_get_target_right_gap_score(Aligner* self, void* closure) +{ if (self->target_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->target_right_open_gap_score; + if (score != self->target_right_extend_gap_score) { + 
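+ /* open and extend scores disagree, so this combined property
+ * has no single value; callers must read the separate
+ * open/extend properties instead. */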
PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_target_right_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->target_right_open_gap_score = score; + self->target_right_extend_gap_score = score; + if (self->target_gap_function) { + Py_DECREF(self->target_gap_function); + self->target_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_end_gap_score__doc__[] = "query end score"; + +static PyObject* +Aligner_get_query_end_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_left_open_gap_score; + if (score != self->query_left_extend_gap_score + || score != self->query_right_open_gap_score + || score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_end_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_end_open_gap_score__doc__[] = "query end open score"; + +static PyObject* +Aligner_get_query_end_open_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_left_open_gap_score; + if (score != self->query_right_open_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_end_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_open_gap_score = score; + self->query_right_open_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_end_extend_gap_score__doc__[] = "query end extend score"; + +static PyObject* +Aligner_get_query_end_extend_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_left_extend_gap_score; + if (score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_end_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_extend_gap_score = score; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + 
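+ /* drop the gap-score callback: an explicit numeric score
+ * replaces any user-supplied gap function */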
self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_internal_open_gap_score__doc__[] = "query internal open gap score"; + +static PyObject* +Aligner_get_query_internal_open_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_internal_open_gap_score); +} + +static int +Aligner_set_query_internal_open_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_internal_open_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_internal_extend_gap_score__doc__[] = "query internal extend gap score"; + +static PyObject* +Aligner_get_query_internal_extend_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_internal_extend_gap_score); +} + +static int +Aligner_set_query_internal_extend_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_internal_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_internal_gap_score__doc__[] = "query internal gap score"; + +static PyObject* +Aligner_get_query_internal_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_internal_open_gap_score; + if (score != self->query_internal_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_internal_gap_score(Aligner* self, PyObject* value, + void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_internal_open_gap_score = score; + self->query_internal_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_left_open_gap_score__doc__[] = "query left open score"; + +static PyObject* +Aligner_get_query_left_open_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_left_open_gap_score); +} + +static int +Aligner_set_query_left_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_open_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_left_extend_gap_score__doc__[] = "query left extend score"; + +static PyObject* +Aligner_get_query_left_extend_gap_score(Aligner* self, void* closure) +{ if 
(self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_left_extend_gap_score); +} + +static int +Aligner_set_query_left_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_left_gap_score__doc__[] = "query left score"; + +static PyObject* +Aligner_get_query_left_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_left_open_gap_score; + if (score != self->query_left_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} + +static int +Aligner_set_query_left_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_left_open_gap_score = score; + self->query_left_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_right_open_gap_score__doc__[] = "query right open score"; + +static PyObject* +Aligner_get_query_right_open_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_right_open_gap_score); +} + +static int +Aligner_set_query_right_open_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_right_open_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_right_extend_gap_score__doc__[] = "query right extend score"; + +static PyObject* +Aligner_get_query_right_extend_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + return PyFloat_FromDouble(self->query_right_extend_gap_score); +} + +static int +Aligner_set_query_right_extend_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_query_right_gap_score__doc__[] = "query right score"; + +static PyObject* +Aligner_get_query_right_gap_score(Aligner* self, void* closure) +{ if (self->query_gap_function) { + PyErr_SetString(PyExc_ValueError, "using a gap score function"); + return NULL; + } + else { + const double score = self->query_right_open_gap_score; + if (score != self->query_right_extend_gap_score) { + PyErr_SetString(PyExc_ValueError, "gap scores are different"); + return NULL; + } + return PyFloat_FromDouble(score); + } +} 
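+
+/* Note on the gap-score properties above: each combined getter raises a
+ * ValueError when a gap-score function is in use or when its open and
+ * extend components disagree, and each setter writes all of its component
+ * scores, releases any gap-score callback, and resets the cached
+ * algorithm. A minimal usage sketch, assuming this module is exposed as
+ * Biopython's PairwiseAligner:
+ *
+ *     aligner.query_right_gap_score = -1.0
+ *     # same as setting both query_right_open_gap_score and
+ *     # query_right_extend_gap_score to -1.0
+ */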
+ +static int +Aligner_set_query_right_gap_score(Aligner* self, PyObject* value, void* closure) +{ const double score = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->query_right_open_gap_score = score; + self->query_right_extend_gap_score = score; + if (self->query_gap_function) { + Py_DECREF(self->query_gap_function); + self->query_gap_function = NULL; + } + self->algorithm = Unknown; + return 0; +} + +static char Aligner_epsilon__doc__[] = "roundoff epsilon"; + +static PyObject* +Aligner_get_epsilon(Aligner* self, void* closure) +{ return PyFloat_FromDouble(self->epsilon); +} + +static int +Aligner_set_epsilon(Aligner* self, PyObject* value, void* closure) +{ const double epsilon = PyFloat_AsDouble(value); + if (PyErr_Occurred()) return -1; + self->epsilon = epsilon; + self->algorithm = Unknown; + return 0; +} + +static PyObject* +Aligner_get_wildcard(Aligner* self, void* closure) +{ + if (self->wildcard == -1) { + Py_INCREF(Py_None); + return Py_None; + } + else { + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &self->wildcard, 1); + } +} + +static int +Aligner_set_wildcard(Aligner* self, PyObject* value, void* closure) +{ + if (value == Py_None) { + self->wildcard = -1; + return 0; + } + if (!PyUnicode_Check(value)) { + PyErr_SetString(PyExc_TypeError, + "wildcard should be a single character, or None"); + return -1; + } + if (PyUnicode_READY(value) == -1) return -1; + if (PyUnicode_GET_LENGTH(value) != 1) { + PyErr_SetString(PyExc_ValueError, + "wildcard should be a single character, or None"); + return -1; + } + self->wildcard = PyUnicode_READ_CHAR(value, 0); + return 0; +} + +static char Aligner_wildcard__doc__[] = "wildcard character"; + +static Algorithm _get_algorithm(Aligner* self) +{ + Algorithm algorithm = self->algorithm; + if (algorithm == Unknown) { + const double target_gap_open = self->target_internal_open_gap_score; + const double query_gap_open = self->query_internal_open_gap_score; + const double target_gap_extend = self->target_internal_extend_gap_score; + const double query_gap_extend = self->query_internal_extend_gap_score; + const double target_left_open = self->target_left_open_gap_score; + const double target_left_extend = self->target_left_extend_gap_score; + const double query_left_open = self->query_left_open_gap_score; + const double target_right_open = self->target_right_open_gap_score; + const double query_right_open = self->query_right_open_gap_score; + const double target_right_extend = self->target_right_extend_gap_score; + const double query_left_extend = self->query_left_extend_gap_score; + const double query_right_extend = self->query_right_extend_gap_score; + if (self->target_gap_function || self->query_gap_function) + algorithm = WatermanSmithBeyer; + else if (target_gap_open == target_gap_extend + && query_gap_open == query_gap_extend + && target_left_open == target_left_extend + && target_right_open == target_right_extend + && query_left_open == query_left_extend + && query_right_open == query_right_extend) + algorithm = NeedlemanWunschSmithWaterman; + else + algorithm = Gotoh; + self->algorithm = algorithm; + } + return algorithm; +} + + +static char Aligner_algorithm__doc__[] = "alignment algorithm"; + +static PyObject* +Aligner_get_algorithm(Aligner* self, void* closure) +{ + const char* s = NULL; + const Mode mode = self->mode; + const Algorithm algorithm = _get_algorithm(self); + switch (algorithm) { + case NeedlemanWunschSmithWaterman: + switch (mode) { + case Global: + s = "Needleman-Wunsch"; + break; + case 
Local: + s = "Smith-Waterman"; + break; + } + break; + case Gotoh: + switch (mode) { + case Global: + s = "Gotoh global alignment algorithm"; + break; + case Local: + s = "Gotoh local alignment algorithm"; + break; + } + break; + case WatermanSmithBeyer: + switch (mode) { + case Global: + s = "Waterman-Smith-Beyer global alignment algorithm"; + break; + case Local: + s = "Waterman-Smith-Beyer local alignment algorithm"; + break; + } + break; + case Unknown: + default: + break; + } + return PyUnicode_FromString(s); +} + +static PyGetSetDef Aligner_getset[] = { + {"mode", + (getter)Aligner_get_mode, + (setter)Aligner_set_mode, + Aligner_mode__doc__, NULL}, + {"match_score", + (getter)Aligner_get_match_score, + (setter)Aligner_set_match_score, + Aligner_match_score__doc__, NULL}, + {"mismatch_score", + (getter)Aligner_get_mismatch_score, + (setter)Aligner_set_mismatch_score, + Aligner_mismatch_score__doc__, NULL}, + {"match", /* synonym for match_score */ + (getter)Aligner_get_match_score, + (setter)Aligner_set_match_score, + Aligner_match_score__doc__, NULL}, + {"mismatch", /* synonym for mismatch_score */ + (getter)Aligner_get_mismatch_score, + (setter)Aligner_set_mismatch_score, + Aligner_mismatch_score__doc__, NULL}, + {"substitution_matrix", + (getter)Aligner_get_substitution_matrix, + (setter)Aligner_set_substitution_matrix, + Aligner_substitution_matrix__doc__, NULL}, + {"alphabet", + (getter)Aligner_get_alphabet, + (setter)Aligner_set_alphabet, + Aligner_alphabet__doc__, NULL}, + {"gap_score", + (getter)Aligner_get_gap_score, + (setter)Aligner_set_gap_score, + Aligner_gap_score__doc__, NULL}, + {"open_gap_score", + (getter)Aligner_get_open_gap_score, + (setter)Aligner_set_open_gap_score, + Aligner_open_gap_score__doc__, NULL}, + {"extend_gap_score", + (getter)Aligner_get_extend_gap_score, + (setter)Aligner_set_extend_gap_score, + Aligner_extend_gap_score__doc__, NULL}, + {"internal_gap_score", + (getter)Aligner_get_internal_gap_score, + (setter)Aligner_set_internal_gap_score, + Aligner_internal_gap_score__doc__, NULL}, + {"internal_open_gap_score", + (getter)Aligner_get_internal_open_gap_score, + (setter)Aligner_set_internal_open_gap_score, + Aligner_internal_open_gap_score__doc__, NULL}, + {"internal_extend_gap_score", + (getter)Aligner_get_internal_extend_gap_score, + (setter)Aligner_set_internal_extend_gap_score, + Aligner_internal_extend_gap_score__doc__, NULL}, + {"end_gap_score", + (getter)Aligner_get_end_gap_score, + (setter)Aligner_set_end_gap_score, + Aligner_end_gap_score__doc__, NULL}, + {"end_open_gap_score", + (getter)Aligner_get_end_open_gap_score, + (setter)Aligner_set_end_open_gap_score, + Aligner_end_open_gap_score__doc__, NULL}, + {"end_extend_gap_score", + (getter)Aligner_get_end_extend_gap_score, + (setter)Aligner_set_end_extend_gap_score, + Aligner_end_extend_gap_score__doc__, NULL}, + {"left_gap_score", + (getter)Aligner_get_left_gap_score, + (setter)Aligner_set_left_gap_score, + Aligner_left_gap_score__doc__, NULL}, + {"left_open_gap_score", + (getter)Aligner_get_left_open_gap_score, + (setter)Aligner_set_left_open_gap_score, + Aligner_left_open_gap_score__doc__, NULL}, + {"left_extend_gap_score", + (getter)Aligner_get_left_extend_gap_score, + (setter)Aligner_set_left_extend_gap_score, + Aligner_left_extend_gap_score__doc__, NULL}, + {"right_gap_score", + (getter)Aligner_get_right_gap_score, + (setter)Aligner_set_right_gap_score, + Aligner_right_gap_score__doc__, NULL}, + {"right_open_gap_score", + (getter)Aligner_get_right_open_gap_score, + 
(setter)Aligner_set_right_open_gap_score, + Aligner_right_open_gap_score__doc__, NULL}, + {"right_extend_gap_score", + (getter)Aligner_get_right_extend_gap_score, + (setter)Aligner_set_right_extend_gap_score, + Aligner_right_extend_gap_score__doc__, NULL}, + {"target_open_gap_score", + (getter)Aligner_get_target_open_gap_score, + (setter)Aligner_set_target_open_gap_score, + Aligner_target_open_gap_score__doc__, NULL}, + {"target_extend_gap_score", + (getter)Aligner_get_target_extend_gap_score, + (setter)Aligner_set_target_extend_gap_score, + Aligner_target_extend_gap_score__doc__, NULL}, + {"target_gap_score", + (getter)Aligner_get_target_gap_score, + (setter)Aligner_set_target_gap_score, + Aligner_target_gap_score__doc__, NULL}, + {"query_open_gap_score", + (getter)Aligner_get_query_open_gap_score, + (setter)Aligner_set_query_open_gap_score, + Aligner_query_open_gap_score__doc__, NULL}, + {"query_extend_gap_score", + (getter)Aligner_get_query_extend_gap_score, + (setter)Aligner_set_query_extend_gap_score, + Aligner_query_extend_gap_score__doc__, NULL}, + {"query_gap_score", + (getter)Aligner_get_query_gap_score, + (setter)Aligner_set_query_gap_score, + Aligner_query_gap_score__doc__, NULL}, + {"target_end_gap_score", + (getter)Aligner_get_target_end_gap_score, + (setter)Aligner_set_target_end_gap_score, + Aligner_target_end_gap_score__doc__, NULL}, + {"target_end_open_gap_score", + (getter)Aligner_get_target_end_open_gap_score, + (setter)Aligner_set_target_end_open_gap_score, + Aligner_target_end_open_gap_score__doc__, NULL}, + {"target_end_extend_gap_score", + (getter)Aligner_get_target_end_extend_gap_score, + (setter)Aligner_set_target_end_extend_gap_score, + Aligner_target_end_extend_gap_score__doc__, NULL}, + {"target_internal_open_gap_score", + (getter)Aligner_get_target_internal_open_gap_score, + (setter)Aligner_set_target_internal_open_gap_score, + Aligner_target_internal_open_gap_score__doc__, NULL}, + {"target_internal_extend_gap_score", + (getter)Aligner_get_target_internal_extend_gap_score, + (setter)Aligner_set_target_internal_extend_gap_score, + Aligner_target_internal_extend_gap_score__doc__, NULL}, + {"target_internal_gap_score", + (getter)Aligner_get_target_internal_gap_score, + (setter)Aligner_set_target_internal_gap_score, + Aligner_target_internal_gap_score__doc__, NULL}, + {"target_left_open_gap_score", + (getter)Aligner_get_target_left_open_gap_score, + (setter)Aligner_set_target_left_open_gap_score, + Aligner_target_left_open_gap_score__doc__, NULL}, + {"target_left_extend_gap_score", + (getter)Aligner_get_target_left_extend_gap_score, + (setter)Aligner_set_target_left_extend_gap_score, + Aligner_target_left_extend_gap_score__doc__, NULL}, + {"target_left_gap_score", + (getter)Aligner_get_target_left_gap_score, + (setter)Aligner_set_target_left_gap_score, + Aligner_target_left_gap_score__doc__, NULL}, + {"target_right_open_gap_score", + (getter)Aligner_get_target_right_open_gap_score, + (setter)Aligner_set_target_right_open_gap_score, + Aligner_target_right_gap_score_open__doc__, NULL}, + {"target_right_extend_gap_score", + (getter)Aligner_get_target_right_extend_gap_score, + (setter)Aligner_set_target_right_extend_gap_score, + Aligner_target_right_extend_gap_score__doc__, NULL}, + {"target_right_gap_score", + (getter)Aligner_get_target_right_gap_score, + (setter)Aligner_set_target_right_gap_score, + Aligner_target_right_gap_score__doc__, NULL}, + {"query_end_gap_score", + (getter)Aligner_get_query_end_gap_score, + (setter)Aligner_set_query_end_gap_score, + 
Aligner_query_end_gap_score__doc__, NULL}, + {"query_end_open_gap_score", + (getter)Aligner_get_query_end_open_gap_score, + (setter)Aligner_set_query_end_open_gap_score, + Aligner_query_end_open_gap_score__doc__, NULL}, + {"query_end_extend_gap_score", + (getter)Aligner_get_query_end_extend_gap_score, + (setter)Aligner_set_query_end_extend_gap_score, + Aligner_query_end_extend_gap_score__doc__, NULL}, + {"query_internal_open_gap_score", + (getter)Aligner_get_query_internal_open_gap_score, + (setter)Aligner_set_query_internal_open_gap_score, + Aligner_query_internal_open_gap_score__doc__, NULL}, + {"query_internal_extend_gap_score", + (getter)Aligner_get_query_internal_extend_gap_score, + (setter)Aligner_set_query_internal_extend_gap_score, + Aligner_query_internal_extend_gap_score__doc__, NULL}, + {"query_internal_gap_score", + (getter)Aligner_get_query_internal_gap_score, + (setter)Aligner_set_query_internal_gap_score, + Aligner_query_internal_gap_score__doc__, NULL}, + {"query_left_open_gap_score", + (getter)Aligner_get_query_left_open_gap_score, + (setter)Aligner_set_query_left_open_gap_score, + Aligner_query_left_open_gap_score__doc__, NULL}, + {"query_left_extend_gap_score", + (getter)Aligner_get_query_left_extend_gap_score, + (setter)Aligner_set_query_left_extend_gap_score, + Aligner_query_left_extend_gap_score__doc__, NULL}, + {"query_left_gap_score", + (getter)Aligner_get_query_left_gap_score, + (setter)Aligner_set_query_left_gap_score, + Aligner_query_left_gap_score__doc__, NULL}, + {"query_right_open_gap_score", + (getter)Aligner_get_query_right_open_gap_score, + (setter)Aligner_set_query_right_open_gap_score, + Aligner_query_right_open_gap_score__doc__, NULL}, + {"query_right_extend_gap_score", + (getter)Aligner_get_query_right_extend_gap_score, + (setter)Aligner_set_query_right_extend_gap_score, + Aligner_query_right_extend_gap_score__doc__, NULL}, + {"query_right_gap_score", + (getter)Aligner_get_query_right_gap_score, + (setter)Aligner_set_query_right_gap_score, + Aligner_query_right_gap_score__doc__, NULL}, + {"epsilon", + (getter)Aligner_get_epsilon, + (setter)Aligner_set_epsilon, + Aligner_epsilon__doc__, NULL}, + {"wildcard", + (getter)Aligner_get_wildcard, + (setter)Aligner_set_wildcard, + Aligner_wildcard__doc__, NULL}, + {"algorithm", + (getter)Aligner_get_algorithm, + (setter)NULL, + Aligner_algorithm__doc__, NULL}, + {NULL} /* Sentinel */ +}; + +#define SELECT_SCORE_GLOBAL(score1, score2, score3) \ + score = score1; \ + temp = score2; \ + if (temp > score) score = temp; \ + temp = score3; \ + if (temp > score) score = temp; + +#define SELECT_SCORE_WATERMAN_SMITH_BEYER(score1, score2) \ + temp = score1 + gapscore; \ + if (temp > score) score = temp; \ + temp = score2 + gapscore; \ + if (temp > score) score = temp; + +#define SELECT_SCORE_GOTOH_LOCAL_ALIGN(score1, score2, score3, score4) \ + score = score1; \ + temp = score2; \ + if (temp > score) score = temp; \ + temp = score3; \ + if (temp > score) score = temp; \ + score += score4; \ + if (score < 0) score = 0; \ + else if (score > maximum) maximum = score; + +#define SELECT_SCORE_LOCAL3(score1, score2, score3) \ + score = score1; \ + temp = score2; \ + if (temp > score) score = temp; \ + temp = score3; \ + if (temp > score) score = temp; \ + if (score < 0) score = 0; \ + else if (score > maximum) maximum = score; + +#define SELECT_SCORE_LOCAL1(score1) \ + score = score1; \ + if (score < 0) score = 0; \ + else if (score > maximum) maximum = score; + +#define SELECT_TRACE_NEEDLEMAN_WUNSCH(hgap, vgap, align_score) \ 
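+ /* Choose the best of the diagonal step (temp + align_score) and the \
+ * horizontal/vertical gap steps; scores within epsilon of the best \
+ * are treated as ties, so more than one trace bit may be set. */ \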
+ score = temp + (align_score); \ + trace = DIAGONAL; \ + temp = row[j-1] + hgap; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = HORIZONTAL; \ + } \ + else if (temp > score - epsilon) trace |= HORIZONTAL; \ + temp = row[j] + vgap; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = VERTICAL; \ + } \ + else if (temp > score - epsilon) trace |= VERTICAL; \ + temp = row[j]; \ + row[j] = score; \ + M[i][j].trace = trace; + +#define SELECT_TRACE_SMITH_WATERMAN_HVD(align_score) \ + trace = DIAGONAL; \ + score = temp + (align_score); \ + temp = row[j-1] + gap_extend_A; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = HORIZONTAL; \ + } \ + else if (temp > score - epsilon) trace |= HORIZONTAL; \ + temp = row[j] + gap_extend_B; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = VERTICAL; \ + } \ + else if (temp > score - epsilon) trace |= VERTICAL; \ + if (score < epsilon) { \ + score = 0; \ + trace = STARTPOINT; \ + } \ + else if (trace & DIAGONAL && score > maximum - epsilon) { \ + if (score > maximum + epsilon) { \ + for ( ; im < i; im++, jm = 0) \ + for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \ + for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \ + im = i; \ + jm = j; \ + } \ + trace |= ENDPOINT; \ + } \ + M[i][j].trace = trace; \ + if (score > maximum) maximum = score; \ + temp = row[j]; \ + row[j] = score; + +#define SELECT_TRACE_SMITH_WATERMAN_D(align_score) \ + score = temp + (align_score); \ + trace = DIAGONAL; \ + if (score < epsilon) { \ + score = 0; \ + } \ + else if (trace & DIAGONAL && score > maximum - epsilon) { \ + if (score > maximum + epsilon) { \ + for ( ; im < i; im++, jm = 0) \ + for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \ + for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \ + im = i; \ + jm = j; \ + } \ + trace |= ENDPOINT; \ + } \ + M[i][j].trace = trace; \ + if (score > maximum) maximum = score; \ + temp = row[j]; \ + row[j] = score + +#define SELECT_TRACE_GOTOH_GLOBAL_GAP(matrix, score1, score2, score3) \ + trace = M_MATRIX; \ + score = score1; \ + temp = score2; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Ix_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Ix_MATRIX; \ + temp = score3; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Iy_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Iy_MATRIX; \ + gaps[i][j].matrix = trace; + +#define SELECT_TRACE_GOTOH_GLOBAL_ALIGN \ + trace = M_MATRIX; \ + score = M_temp; \ + temp = Ix_temp; \ + if (temp > score + epsilon) { \ + score = Ix_temp; \ + trace = Ix_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Ix_MATRIX; \ + temp = Iy_temp; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Iy_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Iy_MATRIX; \ + M[i][j].trace = trace; + +#define SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \ + trace = M_MATRIX; \ + score = M_temp; \ + if (Ix_temp > score + epsilon) { \ + score = Ix_temp; \ + trace = Ix_MATRIX; \ + } \ + else if (Ix_temp > score - epsilon) trace |= Ix_MATRIX; \ + if (Iy_temp > score + epsilon) { \ + score = Iy_temp; \ + trace = Iy_MATRIX; \ + } \ + else if (Iy_temp > score - epsilon) trace |= Iy_MATRIX; \ + score += (align_score); \ + if (score < epsilon) { \ + score = 0; \ + trace = STARTPOINT; \ + } \ + else if (score > maximum - epsilon) { \ + if (score > maximum + epsilon) { \ + maximum = score; \ + for ( ; im < i; im++, jm = 0) \ + for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \ + for ( ; jm 
< j; jm++) M[im][jm].trace &= ~ENDPOINT; \ + im = i; \ + jm = j; \ + } \ + trace |= ENDPOINT; \ + } \ + M[i][j].trace = trace; + +#define SELECT_TRACE_GOTOH_LOCAL_GAP(matrix, score1, score2, score3) \ + trace = M_MATRIX; \ + score = score1; \ + temp = score2; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Ix_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Ix_MATRIX; \ + temp = score3; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Iy_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Iy_MATRIX; \ + if (score < epsilon) { \ + score = -DBL_MAX; \ + trace = 0; \ + } \ + gaps[i][j].matrix = trace; + +#define SELECT_TRACE_WATERMAN_SMITH_BEYER_GLOBAL_ALIGN(score4) \ + trace = M_MATRIX; \ + score = M_row[i-1][j-1]; \ + temp = Ix_row[i-1][j-1]; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Ix_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Ix_MATRIX; \ + temp = Iy_row[i-1][j-1]; \ + if (temp > score + epsilon) { \ + score = temp; \ + trace = Iy_MATRIX; \ + } \ + else if (temp > score - epsilon) trace |= Iy_MATRIX; \ + M_row[i][j] = score + score4; \ + M[i][j].trace = trace; + +#define SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(score1, score2) \ + temp = score1 + gapscore; \ + if (temp > score - epsilon) { \ + if (temp > score + epsilon) { \ + score = temp; \ + nm = 0; \ + ng = 0; \ + } \ + gapM[nm] = gap; \ + nm++; \ + } \ + temp = score2 + gapscore; \ + if (temp > score - epsilon) { \ + if (temp > score + epsilon) { \ + score = temp; \ + nm = 0; \ + ng = 0; \ + } \ + gapXY[ng] = gap; \ + ng++; \ + } + +#define SELECT_TRACE_WATERMAN_SMITH_BEYER_ALIGN(score1, score2, score3, score4) \ + trace = M_MATRIX; \ + score = score1; \ + if (score2 > score + epsilon) { \ + score = score2; \ + trace = Ix_MATRIX; \ + } \ + else if (score2 > score - epsilon) trace |= Ix_MATRIX; \ + if (score3 > score + epsilon) { \ + score = score3; \ + trace = Iy_MATRIX; \ + } \ + else if (score3 > score - epsilon) trace |= Iy_MATRIX; \ + score += score4; \ + if (score < epsilon) { \ + score = 0; \ + trace = STARTPOINT; \ + } \ + else if (score > maximum - epsilon) { \ + if (score > maximum + epsilon) { \ + maximum = score; \ + for ( ; im < i; im++, jm = 0) \ + for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \ + for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \ + im = i; \ + jm = j; \ + } \ + trace |= ENDPOINT; \ + } \ + M_row[i][j] = score; \ + M[i][j].trace = trace; + +/* ----------------- alignment algorithms ----------------- */ + +#define NEEDLEMANWUNSCH_SCORE(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + double score; \ + double temp; \ + double* row; \ + double left_gap_extend_A; \ + double right_gap_extend_A; \ + double left_gap_extend_B; \ + double right_gap_extend_B; \ + switch (strand) { \ + case '+': \ + left_gap_extend_A = self->target_left_extend_gap_score; \ + right_gap_extend_A = self->target_right_extend_gap_score; \ + left_gap_extend_B = self->query_left_extend_gap_score; \ + right_gap_extend_B = self->query_right_extend_gap_score; \ + break; \ + case '-': \ + left_gap_extend_A = self->target_right_extend_gap_score; \ + right_gap_extend_A = self->target_left_extend_gap_score; \ + left_gap_extend_B = self->query_right_extend_gap_score; \ + right_gap_extend_B = self->query_left_extend_gap_score; \ + break; \ + default: \ + PyErr_SetString(PyExc_RuntimeError, "strand 
was neither '+' nor '-'"); \ + return NULL; \ + } \ +\ + /* Needleman-Wunsch algorithm */ \ + row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!row) return PyErr_NoMemory(); \ +\ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. \ + */ \ + row[0] = 0.0; \ + for (j = 1; j <= nB; j++) row[j] = j * left_gap_extend_A; \ + for (i = 1; i < nA; i++) { \ + kA = sA[i-1]; \ + temp = row[0]; \ + row[0] = i * left_gap_extend_B; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GLOBAL(temp + (align_score), \ + row[j] + gap_extend_B, \ + row[j-1] + gap_extend_A); \ + temp = row[j]; \ + row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_GLOBAL(temp + (align_score), \ + row[nB] + right_gap_extend_B, \ + row[nB-1] + gap_extend_A); \ + temp = row[nB]; \ + row[nB] = score; \ + } \ + kA = sA[nA-1]; \ + temp = row[0]; \ + row[0] = nA * left_gap_extend_B; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GLOBAL(temp + (align_score), \ + row[j] + gap_extend_B, \ + row[j-1] + right_gap_extend_A); \ + temp = row[j]; \ + row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_GLOBAL(temp + (align_score), \ + row[nB] + right_gap_extend_B, \ + row[nB-1] + right_gap_extend_A); \ + PyMem_Free(row); \ + return PyFloat_FromDouble(score); + + +#define SMITHWATERMAN_SCORE(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + double score; \ + double* row; \ + double temp; \ + double maximum = 0; \ +\ + /* Smith-Waterman algorithm */ \ + row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!row) return PyErr_NoMemory(); \ +\ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. 
\ + */ \ + for (j = 0; j <= nB; j++) \ + row[j] = 0; \ + for (i = 1; i < nA; i++) { \ + kA = sA[i-1]; \ + temp = 0; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_LOCAL3(temp + (align_score), \ + row[j] + gap_extend_B, \ + row[j-1] + gap_extend_A); \ + temp = row[j]; \ + row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_LOCAL1(temp + (align_score)); \ + temp = row[nB]; \ + row[nB] = score; \ + } \ + kA = sA[nA-1]; \ + temp = 0; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_LOCAL1(temp + (align_score)); \ + temp = row[j]; \ + row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_LOCAL1(temp + (align_score)); \ + PyMem_Free(row); \ + return PyFloat_FromDouble(maximum); + + +#define NEEDLEMANWUNSCH_ALIGN(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + const double epsilon = self->epsilon; \ + Trace** M; \ + double score; \ + int trace; \ + double temp; \ + double* row = NULL; \ + PathGenerator* paths; \ + double left_gap_extend_A; \ + double right_gap_extend_A; \ + double left_gap_extend_B; \ + double right_gap_extend_B; \ + switch (strand) { \ + case '+': \ + left_gap_extend_A = self->target_left_extend_gap_score; \ + right_gap_extend_A = self->target_right_extend_gap_score; \ + left_gap_extend_B = self->query_left_extend_gap_score; \ + right_gap_extend_B = self->query_right_extend_gap_score; \ + break; \ + case '-': \ + left_gap_extend_A = self->target_right_extend_gap_score; \ + right_gap_extend_A = self->target_left_extend_gap_score; \ + left_gap_extend_B = self->query_right_extend_gap_score; \ + right_gap_extend_B = self->query_left_extend_gap_score; \ + break; \ + default: \ + PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \ + return NULL; \ + } \ +\ + /* Needleman-Wunsch algorithm */ \ + paths = PathGenerator_create_NWSW(nA, nB, Global, strand); \ + if (!paths) return NULL; \ + row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!row) { \ + Py_DECREF(paths); \ + return PyErr_NoMemory(); \ + } \ + M = paths->M; \ + row[0] = 0; \ + for (j = 1; j <= nB; j++) row[j] = j * left_gap_extend_A; \ + for (i = 1; i < nA; i++) { \ + temp = row[0]; \ + row[0] = i * left_gap_extend_B; \ + kA = sA[i-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_NEEDLEMAN_WUNSCH(gap_extend_A, gap_extend_B, align_score); \ + } \ + kB = sB[j-1]; \ + SELECT_TRACE_NEEDLEMAN_WUNSCH(gap_extend_A, right_gap_extend_B, align_score); \ + } \ + temp = row[0]; \ + row[0] = i * left_gap_extend_B; \ + kA = sA[nA-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_NEEDLEMAN_WUNSCH(right_gap_extend_A, gap_extend_B, align_score); \ + } \ + kB = sB[j-1]; \ + SELECT_TRACE_NEEDLEMAN_WUNSCH(right_gap_extend_A, right_gap_extend_B, align_score); \ + PyMem_Free(row); \ + M[nA][nB].path = 0; \ + return Py_BuildValue("fN", score, paths); + + +#define SMITHWATERMAN_ALIGN(align_score) \ + int i; \ + int j; \ + int im = nA; \ + int jm = nB; \ + int kA; \ + int kB; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + const double epsilon = self->epsilon; \ + Trace** M = NULL; \ + double maximum = 0; \ + double score = 0; \ + double* row = NULL; \ + double temp; \ + int trace; \ + PathGenerator* paths = NULL; \ +\ + /* Smith-Waterman algorithm */ \ + paths = PathGenerator_create_NWSW(nA, 
nB, Local, strand); \ + if (!paths) return NULL; \ + row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!row) { \ + Py_DECREF(paths); \ + return PyErr_NoMemory(); \ + } \ + M = paths->M; \ + for (j = 0; j <= nB; j++) row[j] = 0; \ + for (i = 1; i < nA; i++) { \ + temp = 0; \ + kA = sA[i-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_SMITH_WATERMAN_HVD(align_score); \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_SMITH_WATERMAN_D(align_score); \ + } \ + temp = 0; \ + kA = sA[nA-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_SMITH_WATERMAN_D(align_score); \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_SMITH_WATERMAN_D(align_score); \ + PyMem_Free(row); \ +\ + /* As we don't allow zero-score extensions to alignments, \ + * we need to remove all traces towards an ENDPOINT. \ + * In addition, some points then won't have any path to a STARTPOINT. \ + * Here, use path as a temporary variable to indicate if the point \ + * is reachable from a STARTPOINT. If it is unreachable, remove all \ + * traces from it, and don't allow it to be an ENDPOINT. It may still \ + * be a valid STARTPOINT. */ \ + for (j = 0; j <= nB; j++) M[0][j].path = 1; \ + for (i = 1; i <= nA; i++) { \ + M[i][0].path = 1; \ + for (j = 1; j <= nB; j++) { \ + trace = M[i][j].trace; \ + /* Remove traces to unreachable points. */ \ + if (!M[i-1][j-1].path) trace &= ~DIAGONAL; \ + if (!M[i][j-1].path) trace &= ~HORIZONTAL; \ + if (!M[i-1][j].path) trace &= ~VERTICAL; \ + if (trace & (STARTPOINT | HORIZONTAL | VERTICAL | DIAGONAL)) { \ + /* The point is reachable. */ \ + if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \ + else M[i][j].path = 1; \ + } \ + else { \ + /* The point is not reachable. Then it is not a STARTPOINT, \ + * all traces from it can be removed, and it cannot act as \ + * an ENDPOINT. 
*/ \ + M[i][j].path = 0; \ + trace = 0; \ + } \ + M[i][j].trace = trace; \ + } \ + } \ + if (maximum == 0) M[0][0].path = NONE; \ + else M[0][0].path = 0; \ + return Py_BuildValue("fN", maximum, paths); + + +#define GOTOH_GLOBAL_SCORE(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_open_A = self->target_internal_open_gap_score; \ + const double gap_open_B = self->query_internal_open_gap_score; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + double left_gap_open_A; \ + double left_gap_open_B; \ + double left_gap_extend_A; \ + double left_gap_extend_B; \ + double right_gap_open_A; \ + double right_gap_open_B; \ + double right_gap_extend_A; \ + double right_gap_extend_B; \ + double* M_row = NULL; \ + double* Ix_row = NULL; \ + double* Iy_row = NULL; \ + double score; \ + double temp; \ + double M_temp; \ + double Ix_temp; \ + double Iy_temp; \ + switch (strand) { \ + case '+': \ + left_gap_open_A = self->target_left_open_gap_score; \ + left_gap_open_B = self->query_left_open_gap_score; \ + left_gap_extend_A = self->target_left_extend_gap_score; \ + left_gap_extend_B = self->query_left_extend_gap_score; \ + right_gap_open_A = self->target_right_open_gap_score; \ + right_gap_open_B = self->query_right_open_gap_score; \ + right_gap_extend_A = self->target_right_extend_gap_score; \ + right_gap_extend_B = self->query_right_extend_gap_score; \ + break; \ + case '-': \ + left_gap_open_A = self->target_right_open_gap_score; \ + left_gap_open_B = self->query_right_open_gap_score; \ + left_gap_extend_A = self->target_right_extend_gap_score; \ + left_gap_extend_B = self->query_right_extend_gap_score; \ + right_gap_open_A = self->target_left_open_gap_score; \ + right_gap_open_B = self->query_left_open_gap_score; \ + right_gap_extend_A = self->target_left_extend_gap_score; \ + right_gap_extend_B = self->query_left_extend_gap_score; \ + break; \ + default: \ + PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \ + return NULL; \ + } \ +\ + /* Gotoh algorithm with three states */ \ + M_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!M_row) goto exit; \ + Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Ix_row) goto exit; \ + Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Iy_row) goto exit; \ +\ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. 
\ + */ \ + M_row[0] = 0; \ + Ix_row[0] = -DBL_MAX; \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j <= nB; j++) { \ + M_row[j] = -DBL_MAX; \ + Ix_row[j] = -DBL_MAX; \ + Iy_row[j] = left_gap_open_A + left_gap_extend_A * (j-1); \ + } \ +\ + for (i = 1; i < nA; i++) { \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[i-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GLOBAL(M_temp, \ + Ix_temp, \ + Iy_temp); \ + M_temp = M_row[j]; \ + M_row[j] = score + (align_score); \ + SELECT_SCORE_GLOBAL(M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_SCORE_GLOBAL(M_row[j-1] + gap_open_A, \ + Ix_row[j-1] + gap_open_A, \ + Iy_row[j-1] + gap_extend_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_GLOBAL(M_temp, \ + Ix_temp, \ + Iy_temp); \ + M_temp = M_row[nB]; \ + M_row[nB] = score + (align_score); \ + SELECT_SCORE_GLOBAL(M_temp + right_gap_open_B, \ + Ix_row[nB] + right_gap_extend_B, \ + Iy_row[nB] + right_gap_open_B); \ + Ix_row[nB] = score; \ + SELECT_SCORE_GLOBAL(M_row[nB-1] + gap_open_A, \ + Iy_row[nB-1] + gap_extend_A, \ + Ix_row[nB-1] + gap_open_A); \ + Iy_row[nB] = score; \ + } \ +\ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[nA-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GLOBAL(M_temp, \ + Ix_temp, \ + Iy_temp); \ + M_temp = M_row[j]; \ + M_row[j] = score + (align_score); \ + SELECT_SCORE_GLOBAL(M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_SCORE_GLOBAL(M_row[j-1] + right_gap_open_A, \ + Iy_row[j-1] + right_gap_extend_A, \ + Ix_row[j-1] + right_gap_open_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ +\ + kB = sB[nB-1]; \ + SELECT_SCORE_GLOBAL(M_temp, \ + Ix_temp, \ + Iy_temp); \ + M_temp = M_row[nB]; \ + M_row[nB] = score + (align_score); \ + SELECT_SCORE_GLOBAL(M_temp + right_gap_open_B, \ + Ix_row[nB] + right_gap_extend_B, \ + Iy_row[nB] + right_gap_open_B); \ + Ix_temp = Ix_row[nB]; \ + Ix_row[nB] = score; \ + SELECT_SCORE_GLOBAL(M_row[nB-1] + right_gap_open_A, \ + Ix_row[nB-1] + right_gap_open_A, \ + Iy_row[nB-1] + right_gap_extend_A); \ + Iy_temp = Iy_row[nB]; \ + Iy_row[nB] = score; \ +\ + SELECT_SCORE_GLOBAL(M_row[nB], Ix_row[nB], Iy_row[nB]); \ + PyMem_Free(M_row); \ + PyMem_Free(Ix_row); \ + PyMem_Free(Iy_row); \ + return PyFloat_FromDouble(score); \ +\ +exit: \ + if (M_row) PyMem_Free(M_row); \ + if (Ix_row) PyMem_Free(Ix_row); \ + if (Iy_row) PyMem_Free(Iy_row); \ + return PyErr_NoMemory(); \ + + +#define GOTOH_LOCAL_SCORE(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_open_A = self->target_internal_open_gap_score; \ + const double gap_open_B = self->query_internal_open_gap_score; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + double* M_row = NULL; \ + double* Ix_row = NULL; \ + double* Iy_row = NULL; \ + double score; \ + double temp; \ + double M_temp; \ + double Ix_temp; \ + double Iy_temp; \ + double maximum = 0.0; \ +\ + /* Gotoh algorithm with three states */ \ + M_row = 
PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!M_row) goto exit; \ + Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Ix_row) goto exit; \ + Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Iy_row) goto exit; \ + \ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. \ + */ \ + M_row[0] = 0; \ + Ix_row[0] = -DBL_MAX; \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j <= nB; j++) { \ + M_row[j] = -DBL_MAX; \ + Ix_row[j] = -DBL_MAX; \ + Iy_row[j] = 0; \ + } \ + for (i = 1; i < nA; i++) { \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = 0; \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[i-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \ + Ix_temp, \ + Iy_temp, \ + (align_score)); \ + M_temp = M_row[j]; \ + M_row[j] = score; \ + SELECT_SCORE_LOCAL3(M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_SCORE_LOCAL3(M_row[j-1] + gap_open_A, \ + Ix_row[j-1] + gap_open_A, \ + Iy_row[j-1] + gap_extend_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ + kB = sB[nB-1]; \ + Ix_row[nB] = 0; \ + Iy_row[nB] = 0; \ + SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \ + Ix_temp, \ + Iy_temp, \ + (align_score)); \ + M_temp = M_row[nB]; \ + M_row[nB] = score; \ + } \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = 0; \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[nA-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \ + Ix_temp, \ + Iy_temp, \ + (align_score)); \ + M_temp = M_row[j]; \ + M_row[j] = score; \ + Ix_temp = Ix_row[j]; \ + Iy_temp = Iy_row[j]; \ + Ix_row[j] = 0; \ + Iy_row[j] = 0; \ + } \ + kB = sB[nB-1]; \ + SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \ + Ix_temp, \ + Iy_temp, \ + (align_score)); \ + PyMem_Free(M_row); \ + PyMem_Free(Ix_row); \ + PyMem_Free(Iy_row); \ + return PyFloat_FromDouble(maximum); \ +exit: \ + if (M_row) PyMem_Free(M_row); \ + if (Ix_row) PyMem_Free(Ix_row); \ + if (Iy_row) PyMem_Free(Iy_row); \ + return PyErr_NoMemory(); \ + + +#define GOTOH_GLOBAL_ALIGN(align_score) \ + int i; \ + int j; \ + int kA; \ + int kB; \ + const double gap_open_A = self->target_internal_open_gap_score; \ + const double gap_open_B = self->query_internal_open_gap_score; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + double left_gap_open_A; \ + double left_gap_open_B; \ + double left_gap_extend_A; \ + double left_gap_extend_B; \ + double right_gap_open_A; \ + double right_gap_open_B; \ + double right_gap_extend_A; \ + double right_gap_extend_B; \ + const double epsilon = self->epsilon; \ + TraceGapsGotoh** gaps = NULL; \ + Trace** M = NULL; \ + double* M_row = NULL; \ + double* Ix_row = NULL; \ + double* Iy_row = NULL; \ + double score; \ + int trace; \ + double temp; \ + double M_temp; \ + double Ix_temp; \ + double Iy_temp; \ + PathGenerator* paths; \ + switch (strand) { \ + case '+': \ + left_gap_open_A = self->target_left_open_gap_score; \ + left_gap_open_B = self->query_left_open_gap_score; \ + left_gap_extend_A = self->target_left_extend_gap_score; \ + left_gap_extend_B = self->query_left_extend_gap_score; \ + right_gap_open_A = self->target_right_open_gap_score; \ + right_gap_open_B = self->query_right_open_gap_score; \ + right_gap_extend_A = 
self->target_right_extend_gap_score; \ + right_gap_extend_B = self->query_right_extend_gap_score; \ + break; \ + case '-': \ + left_gap_open_A = self->target_right_open_gap_score; \ + left_gap_open_B = self->query_right_open_gap_score; \ + left_gap_extend_A = self->target_right_extend_gap_score; \ + left_gap_extend_B = self->query_right_extend_gap_score; \ + right_gap_open_A = self->target_left_open_gap_score; \ + right_gap_open_B = self->query_left_open_gap_score; \ + right_gap_extend_A = self->target_left_extend_gap_score; \ + right_gap_extend_B = self->query_left_extend_gap_score; \ + break; \ + default: \ + PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \ + return NULL; \ + } \ +\ + /* Gotoh algorithm with three states */ \ + paths = PathGenerator_create_Gotoh(nA, nB, Global, strand); \ + if (!paths) return NULL; \ + M_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!M_row) goto exit; \ + Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Ix_row) goto exit; \ + Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Iy_row) goto exit; \ + M = paths->M; \ + gaps = paths->gaps.gotoh; \ + \ + /* Gotoh algorithm with three states */ \ + M_row[0] = 0; \ + Ix_row[0] = -DBL_MAX; \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j <= nB; j++) { \ + M_row[j] = -DBL_MAX; \ + Ix_row[j] = -DBL_MAX; \ + Iy_row[j] = left_gap_open_A + left_gap_extend_A * (j-1); \ + } \ + for (i = 1; i < nA; i++) { \ + kA = sA[i-1]; \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \ + M_temp = M_row[j]; \ + M_row[j] = score + (align_score); \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \ + M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \ + M_row[j-1] + gap_open_A, \ + Ix_row[j-1] + gap_open_A, \ + Iy_row[j-1] + gap_extend_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \ + M_temp = M_row[nB]; \ + M_row[nB] = score + (align_score); \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \ + M_temp + right_gap_open_B, \ + Ix_row[nB] + right_gap_extend_B, \ + Iy_row[nB] + right_gap_open_B); \ + Ix_temp = Ix_row[nB]; \ + Ix_row[nB] = score; \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \ + M_row[nB-1] + gap_open_A, \ + Ix_row[nB-1] + gap_open_A, \ + Iy_row[nB-1] + gap_extend_A); \ + Iy_temp = Iy_row[nB]; \ + Iy_row[nB] = score; \ + } \ + kA = sA[nA-1]; \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = -DBL_MAX; \ + Ix_row[0] = left_gap_open_B + left_gap_extend_B * (nA-1); \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \ + M_temp = M_row[j]; \ + M_row[j] = score + (align_score); \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \ + M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \ + M_row[j-1] + right_gap_open_A, \ + Ix_row[j-1] + right_gap_open_A, \ + Iy_row[j-1] + right_gap_extend_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \ + M_temp = M_row[j]; \ + M_row[j] = score + (align_score); \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \ + M_temp + right_gap_open_B, \ + 
Ix_row[j] + right_gap_extend_B, \ + Iy_row[j] + right_gap_open_B); \ + Ix_row[nB] = score; \ + SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \ + M_row[j-1] + right_gap_open_A, \ + Ix_row[j-1] + right_gap_open_A, \ + Iy_row[j-1] + right_gap_extend_A); \ + Iy_row[nB] = score; \ + M[nA][nB].path = 0; \ + \ + /* traceback */ \ + SELECT_SCORE_GLOBAL(M_row[nB], Ix_row[nB], Iy_row[nB]); \ + if (M_row[nB] < score - epsilon) M[nA][nB].trace = 0; \ + if (Ix_row[nB] < score - epsilon) gaps[nA][nB].Ix = 0; \ + if (Iy_row[nB] < score - epsilon) gaps[nA][nB].Iy = 0; \ + return Py_BuildValue("fN", score, paths); \ +exit: \ + Py_DECREF(paths); \ + if (M_row) PyMem_Free(M_row); \ + if (Ix_row) PyMem_Free(Ix_row); \ + if (Iy_row) PyMem_Free(Iy_row); \ + return PyErr_NoMemory(); \ + + +#define GOTOH_LOCAL_ALIGN(align_score) \ + int i; \ + int j; \ + int im = nA; \ + int jm = nB; \ + int kA; \ + int kB; \ + const double gap_open_A = self->target_internal_open_gap_score; \ + const double gap_open_B = self->query_internal_open_gap_score; \ + const double gap_extend_A = self->target_internal_extend_gap_score; \ + const double gap_extend_B = self->query_internal_extend_gap_score; \ + const double epsilon = self->epsilon; \ + Trace** M = NULL; \ + TraceGapsGotoh** gaps = NULL; \ + double* M_row = NULL; \ + double* Ix_row = NULL; \ + double* Iy_row = NULL; \ + double score; \ + int trace; \ + double temp; \ + double M_temp; \ + double Ix_temp; \ + double Iy_temp; \ + double maximum = 0.0; \ + PathGenerator* paths; \ + \ + /* Gotoh algorithm with three states */ \ + paths = PathGenerator_create_Gotoh(nA, nB, Local, strand); \ + if (!paths) return NULL; \ + M = paths->M; \ + gaps = paths->gaps.gotoh; \ + M_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!M_row) goto exit; \ + Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Ix_row) goto exit; \ + Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Iy_row) goto exit; \ + M_row[0] = 0; \ + Ix_row[0] = -DBL_MAX; \ + Iy_row[0] = -DBL_MAX; \ + for (j = 1; j <= nB; j++) { \ + M_row[j] = 0; \ + Ix_row[j] = -DBL_MAX; \ + Iy_row[j] = -DBL_MAX; \ + } \ + for (i = 1; i < nA; i++) { \ + M_temp = M_row[0]; \ + Ix_temp = Ix_row[0]; \ + Iy_temp = Iy_row[0]; \ + M_row[0] = 0; \ + Ix_row[0] = -DBL_MAX; \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[i-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \ + M_temp = M_row[j]; \ + M_row[j] = score; \ + SELECT_TRACE_GOTOH_LOCAL_GAP(Ix, \ + M_temp + gap_open_B, \ + Ix_row[j] + gap_extend_B, \ + Iy_row[j] + gap_open_B); \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = score; \ + SELECT_TRACE_GOTOH_LOCAL_GAP(Iy, \ + M_row[j-1] + gap_open_A, \ + Ix_row[j-1] + gap_open_A, \ + Iy_row[j-1] + gap_extend_A); \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = score; \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \ + M_temp = M_row[j]; \ + M_row[j] = score; \ + Ix_temp = Ix_row[nB]; \ + Ix_row[nB] = 0; \ + gaps[i][nB].Ix = 0; \ + Iy_temp = Iy_row[nB]; \ + Iy_row[nB] = 0; \ + gaps[i][nB].Iy = 0; \ + } \ + M_temp = M_row[0]; \ + M_row[0] = 0; \ + M[nA][0].trace = 0; \ + Ix_temp = Ix_row[0]; \ + Ix_row[0] = -DBL_MAX; \ + gaps[nA][0].Ix = 0; \ + gaps[nA][0].Iy = 0; \ + Iy_temp = Iy_row[0]; \ + Iy_row[0] = -DBL_MAX; \ + kA = sA[nA-1]; \ + for (j = 1; j < nB; j++) { \ + kB = sB[j-1]; \ + SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \ + M_temp = M_row[j]; \ + M_row[j] = score; \ + Ix_temp = Ix_row[j]; \ + Ix_row[j] = 0; \ + gaps[nA][j].Ix = 0; \ + Iy_temp = Iy_row[j]; \ + Iy_row[j] = 0; \ + gaps[nA][j].Iy = 
0; \ + } \ + kB = sB[nB-1]; \ + SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \ + gaps[nA][nB].Ix = 0; \ + gaps[nA][nB].Iy = 0; \ +\ + PyMem_Free(M_row); \ + PyMem_Free(Ix_row); \ + PyMem_Free(Iy_row); \ +\ + /* As we don't allow zero-score extensions to alignments, \ + * we need to remove all traces towards an ENDPOINT. \ + * In addition, some points then won't have any path to a STARTPOINT. \ + * Here, use path as a temporary variable to indicate if the point \ + * is reachable from a STARTPOINT. If it is unreachable, remove all \ + * traces from it, and don't allow it to be an ENDPOINT. It may still \ + * be a valid STARTPOINT. */ \ + for (j = 0; j <= nB; j++) M[0][j].path = M_MATRIX; \ + for (i = 1; i <= nA; i++) { \ + M[i][0].path = M_MATRIX; \ + for (j = 1; j <= nB; j++) { \ + /* Remove traces to unreachable points. */ \ + trace = M[i][j].trace; \ + if (!(M[i-1][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \ + if (!(M[i-1][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \ + if (!(M[i-1][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \ + if (trace & (STARTPOINT | M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \ + /* The point is reachable. */ \ + if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \ + else M[i][j].path |= M_MATRIX; \ + } \ + else { \ + /* The point is not reachable. Then it is not a STARTPOINT, \ + * all traces from it can be removed, and it cannot act as \ + * an ENDPOINT. */ \ + M[i][j].path &= ~M_MATRIX; \ + trace = 0; \ + } \ + M[i][j].trace = trace; \ + trace = gaps[i][j].Ix; \ + if (!(M[i-1][j].path & M_MATRIX)) trace &= ~M_MATRIX; \ + if (!(M[i-1][j].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \ + if (!(M[i-1][j].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \ + if (trace & (M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \ + /* The point is reachable. */ \ + M[i][j].path |= Ix_MATRIX; \ + } \ + else { \ + /* The point is not reachable. Then \ + * all traces from it can be removed. */ \ + M[i][j].path &= ~Ix_MATRIX; \ + trace = 0; \ + } \ + gaps[i][j].Ix = trace; \ + trace = gaps[i][j].Iy; \ + if (!(M[i][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \ + if (!(M[i][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \ + if (!(M[i][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \ + if (trace & (M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \ + /* The point is reachable. */ \ + M[i][j].path |= Iy_MATRIX; \ + } \ + else { \ + /* The point is not reachable. Then \ + * all traces from it can be removed. 
*/ \ + M[i][j].path &= ~Iy_MATRIX; \ + trace = 0; \ + } \ + gaps[i][j].Iy = trace; \ + } \ + } \ +\ + /* traceback */ \ + if (maximum == 0) M[0][0].path = DONE; \ + else M[0][0].path = 0; \ + return Py_BuildValue("fN", maximum, paths); \ +\ +exit: \ + Py_DECREF(paths); \ + if (M_row) PyMem_Free(M_row); \ + if (Ix_row) PyMem_Free(Ix_row); \ + if (Iy_row) PyMem_Free(Iy_row); \ + return PyErr_NoMemory(); \ + + +#define WATERMANSMITHBEYER_ENTER_SCORE \ + int i; \ + int j = 0; \ + int k; \ + int kA; \ + int kB; \ + double** M = NULL; \ + double** Ix = NULL; \ + double** Iy = NULL; \ + double score = 0.0; \ + double gapscore = 0.0; \ + double temp; \ + int ok = 1; \ + PyObject* result = NULL; \ +\ + /* Waterman-Smith-Beyer algorithm */ \ + M = PyMem_Malloc((nA+1)*sizeof(double*)); \ + if (!M) goto exit; \ + Ix = PyMem_Malloc((nA+1)*sizeof(double*)); \ + if (!Ix) goto exit; \ + Iy = PyMem_Malloc((nA+1)*sizeof(double*)); \ + if (!Iy) goto exit; \ + for (i = 0; i <= nA; i++) { \ + M[i] = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!M[i]) goto exit; \ + Ix[i] = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Ix[i]) goto exit; \ + Iy[i] = PyMem_Malloc((nB+1)*sizeof(double)); \ + if (!Iy[i]) goto exit; \ + } \ + + +#define WATERMANSMITHBEYER_GLOBAL_SCORE(align_score, query_gap_start) \ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. \ + */ \ + M[0][0] = 0; \ + Ix[0][0] = -DBL_MAX; \ + Iy[0][0] = -DBL_MAX; \ + for (i = 1; i <= nA; i++) { \ + M[i][0] = -DBL_MAX; \ + Iy[i][0] = -DBL_MAX; \ + ok = _call_query_gap_function(self, query_gap_start, i, &score); \ + if (!ok) goto exit; \ + Ix[i][0] = score; \ + } \ + for (j = 1; j <= nB; j++) { \ + M[0][j] = -DBL_MAX; \ + Ix[0][j] = -DBL_MAX; \ + ok = _call_target_gap_function(self, 0, j, &score); \ + if (!ok) goto exit; \ + Iy[0][j] = score; \ + } \ + for (i = 1; i <= nA; i++) { \ + kA = sA[i-1]; \ + for (j = 1; j <= nB; j++) { \ + kB = sB[j-1]; \ + SELECT_SCORE_GLOBAL(M[i-1][j-1], Ix[i-1][j-1], Iy[i-1][j-1]); \ + M[i][j] = score + (align_score); \ + score = -DBL_MAX; \ + for (k = 1; k <= i; k++) { \ + ok = _call_query_gap_function(self, query_gap_start, k, &gapscore); \ + if (!ok) goto exit; \ + SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i-k][j], Iy[i-k][j]); \ + } \ + Ix[i][j] = score; \ + score = -DBL_MAX; \ + for (k = 1; k <= j; k++) { \ + ok = _call_target_gap_function(self, i, k, &gapscore); \ + if (!ok) goto exit; \ + SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i][j-k], Ix[i][j-k]); \ + } \ + Iy[i][j] = score; \ + } \ + } \ + SELECT_SCORE_GLOBAL(M[nA][nB], Ix[nA][nB], Iy[nA][nB]); \ +\ + result = PyFloat_FromDouble(score); \ + + +#define WATERMANSMITHBEYER_LOCAL_SCORE(align_score, query_gap_start) \ + /* The top row of the score matrix is a special case, \ + * as there are no previously aligned characters. 
\
+     */ \
+    M[0][0] = 0; \
+    Ix[0][0] = -DBL_MAX; \
+    Iy[0][0] = -DBL_MAX; \
+    for (i = 1; i <= nA; i++) { \
+        M[i][0] = -DBL_MAX; \
+        Ix[i][0] = 0; \
+        Iy[i][0] = -DBL_MAX; \
+    } \
+    for (j = 1; j <= nB; j++) { \
+        M[0][j] = -DBL_MAX; \
+        Ix[0][j] = -DBL_MAX; \
+        Iy[0][j] = 0; \
+    } \
+    for (i = 1; i <= nA; i++) { \
+        kA = sA[i-1]; \
+        for (j = 1; j <= nB; j++) { \
+            kB = sB[j-1]; \
+            SELECT_SCORE_GOTOH_LOCAL_ALIGN(M[i-1][j-1], \
+                                           Ix[i-1][j-1], \
+                                           Iy[i-1][j-1], \
+                                           (align_score)); \
+            M[i][j] = score; \
+            if (i == nA || j == nB) { \
+                Ix[i][j] = 0; \
+                Iy[i][j] = 0; \
+                continue; \
+            } \
+            score = 0.0; \
+            for (k = 1; k <= i; k++) { \
+                ok = _call_query_gap_function(self, query_gap_start, k, &gapscore); \
+                if (!ok) goto exit; \
+                SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i-k][j], Iy[i-k][j]); \
+            } \
+            if (score > maximum) maximum = score; \
+            Ix[i][j] = score; \
+            score = 0.0; \
+            for (k = 1; k <= j; k++) { \
+                ok = _call_target_gap_function(self, i, k, &gapscore); \
+                if (!ok) goto exit; \
+                SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i][j-k], Ix[i][j-k]); \
+            } \
+            if (score > maximum) maximum = score; \
+            Iy[i][j] = score; \
+        } \
+    } \
+    SELECT_SCORE_GLOBAL(M[nA][nB], Ix[nA][nB], Iy[nA][nB]); \
+    if (score > maximum) maximum = score; \
+    result = PyFloat_FromDouble(maximum); \
+
+
+#define WATERMANSMITHBEYER_EXIT_SCORE \
+exit: \
+    if (M) { \
+        /* If M is NULL, then Ix is also NULL. */ \
+        if (Ix) { \
+            /* If Ix is NULL, then Iy is also NULL. */ \
+            if (Iy) { \
+                /* If Iy is NULL, then M[i], Ix[i], and Iy[i] are \
+                 * also NULL. */ \
+                for (i = 0; i <= nA; i++) { \
+                    if (!M[i]) break; \
+                    PyMem_Free(M[i]); \
+                    if (!Ix[i]) break; \
+                    PyMem_Free(Ix[i]); \
+                    if (!Iy[i]) break; \
+                    PyMem_Free(Iy[i]); \
+                } \
+                PyMem_Free(Iy); \
+            } \
+            PyMem_Free(Ix); \
+        } \
+        PyMem_Free(M); \
+    } \
+    if (!ok) return NULL; \
+    if (!result) return PyErr_NoMemory(); \
+    return result; \
+
+
+#define WATERMANSMITHBEYER_ENTER_ALIGN(mode) \
+    int i; \
+    int j = 0; \
+    int gap; \
+    int kA; \
+    int kB; \
+    const double epsilon = self->epsilon; \
+    Trace** M; \
+    TraceGapsWatermanSmithBeyer** gaps; \
+    double** M_row; \
+    double** Ix_row; \
+    double** Iy_row; \
+    int ng; \
+    int nm; \
+    double score; \
+    double gapscore; \
+    double temp; \
+    int trace; \
+    int* gapM; \
+    int* gapXY; \
+    int ok = 1; \
+    PathGenerator* paths = NULL; \
+    \
+    /* Waterman-Smith-Beyer algorithm */ \
+    paths = PathGenerator_create_WSB(nA, nB, mode, strand); \
+    if (!paths) return NULL; \
+    M = paths->M; \
+    gaps = paths->gaps.waterman_smith_beyer; \
+    M_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+    if (!M_row) goto exit; \
+    Ix_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+    if (!Ix_row) goto exit; \
+    Iy_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+    if (!Iy_row) goto exit; \
+    for (i = 0; i <= nA; i++) { \
+        M_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+        if (!M_row[i]) goto exit; \
+        Ix_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+        if (!Ix_row[i]) goto exit; \
+        Iy_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+        if (!Iy_row[i]) goto exit; \
+    } \
+
+
+#define WATERMANSMITHBEYER_GLOBAL_ALIGN(align_score, query_gap_start) \
+    M_row[0][0] = 0; \
+    Ix_row[0][0] = -DBL_MAX; \
+    Iy_row[0][0] = -DBL_MAX; \
+    for (i = 1; i <= nA; i++) { \
+        M_row[i][0] = -DBL_MAX; \
+        Iy_row[i][0] = -DBL_MAX; \
+        ok = _call_query_gap_function(self, query_gap_start, i, &score); \
+        if (!ok) goto exit; \
+        Ix_row[i][0] = score; \
+    } \
+    for (j = 1; j <= nB; j++) { \
+        M_row[0][j] = -DBL_MAX; \
+        Ix_row[0][j] = -DBL_MAX; \
+        ok = _call_target_gap_function(self, 0, j, &score); \
+        if (!ok) goto exit; \
+        Iy_row[0][j] = score; \
+    } \
+    for (i = 1; i <= nA; i++) { \
+        kA = sA[i-1]; \
+        for (j = 1; j <= nB; j++) { \
+            kB = sB[j-1]; \
+            SELECT_TRACE_WATERMAN_SMITH_BEYER_GLOBAL_ALIGN((align_score)); \
+            gapM = PyMem_Malloc((i+1)*sizeof(int)); \
+            if (!gapM) goto exit; \
+            gaps[i][j].MIx = gapM; \
+            gapXY = PyMem_Malloc((i+1)*sizeof(int)); \
+            if (!gapXY) goto exit; \
+            gaps[i][j].IyIx = gapXY; \
+            nm = 0; \
+            ng = 0; \
+            score = -DBL_MAX; \
+            for (gap = 1; gap <= i; gap++) { \
+                ok = _call_query_gap_function(self, query_gap_start, gap, &gapscore); \
+                if (!ok) goto exit; \
+                SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i-gap][j], \
+                                                      Iy_row[i-gap][j]); \
+            } \
+            gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+            if (!gapM) goto exit; \
+            gaps[i][j].MIx = gapM; \
+            gapM[nm] = 0; \
+            gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+            if (!gapXY) goto exit; \
+            gapXY[ng] = 0; \
+            gaps[i][j].IyIx = gapXY; \
+            Ix_row[i][j] = score; \
+            gapM = PyMem_Malloc((j+1)*sizeof(int)); \
+            if (!gapM) goto exit; \
+            gaps[i][j].MIy = gapM; \
+            gapXY = PyMem_Malloc((j+1)*sizeof(int)); \
+            if (!gapXY) goto exit; \
+            gaps[i][j].IxIy = gapXY; \
+            nm = 0; \
+            ng = 0; \
+            score = -DBL_MAX; \
+            for (gap = 1; gap <= j; gap++) { \
+                ok = _call_target_gap_function(self, i, gap, &gapscore); \
+                if (!ok) goto exit; \
+                SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i][j-gap], \
+                                                      Ix_row[i][j-gap]); \
+            } \
+            Iy_row[i][j] = score; \
+            gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+            if (!gapM) goto exit; \
+            gaps[i][j].MIy = gapM; \
+            gapM[nm] = 0; \
+            gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+            if (!gapXY) goto exit; \
+            gaps[i][j].IxIy = gapXY; \
+            gapXY[ng] = 0; \
+        } \
+    } \
+    /* traceback */ \
+    SELECT_SCORE_GLOBAL(M_row[nA][nB], Ix_row[nA][nB], Iy_row[nA][nB]); \
+    M[nA][nB].path = 0; \
+    if (M_row[nA][nB] < score - epsilon) M[nA][nB].trace = 0; \
+    if (Ix_row[nA][nB] < score - epsilon) { \
+        gapM = PyMem_Realloc(gaps[nA][nB].MIx, sizeof(int)); \
+        if (!gapM) goto exit; \
+        gapM[0] = 0; \
+        gaps[nA][nB].MIx = gapM; \
+        gapXY = PyMem_Realloc(gaps[nA][nB].IyIx, sizeof(int)); \
+        if (!gapXY) goto exit; \
+        gapXY[0] = 0; \
+        gaps[nA][nB].IyIx = gapXY; \
+    } \
+    if (Iy_row[nA][nB] < score - epsilon) { \
+        gapM = PyMem_Realloc(gaps[nA][nB].MIy, sizeof(int)); \
+        if (!gapM) goto exit; \
+        gapM[0] = 0; \
+        gaps[nA][nB].MIy = gapM; \
+        gapXY = PyMem_Realloc(gaps[nA][nB].IxIy, sizeof(int)); \
+        if (!gapXY) goto exit; \
+        gapXY[0] = 0; \
+        gaps[nA][nB].IxIy = gapXY; \
+    } \
+    for (i = 0; i <= nA; i++) { \
+        PyMem_Free(M_row[i]); \
+        PyMem_Free(Ix_row[i]); \
+        PyMem_Free(Iy_row[i]); \
+    } \
+    PyMem_Free(M_row); \
+    PyMem_Free(Ix_row); \
+    PyMem_Free(Iy_row); \
+    return Py_BuildValue("fN", score, paths); \
+
+
+#define WATERMANSMITHBEYER_LOCAL_ALIGN(align_score, query_gap_start) \
+    M_row[0][0] = 0; \
+    Ix_row[0][0] = -DBL_MAX; \
+    Iy_row[0][0] = -DBL_MAX; \
+    for (i = 1; i <= nA; i++) { \
+        M_row[i][0] = 0; \
+        Ix_row[i][0] = -DBL_MAX; \
+        Iy_row[i][0] = -DBL_MAX; \
+    } \
+    for (i = 1; i <= nB; i++) { \
+        M_row[0][i] = 0; \
+        Ix_row[0][i] = -DBL_MAX; \
+        Iy_row[0][i] = -DBL_MAX; \
+    } \
+    for (i = 1; i <= nA; i++) { \
+        kA = sA[i-1]; \
+        for (j = 1; j <= nB; j++) { \
+            kB = sB[j-1]; \
+            nm = 0; \
+            ng = 0; \
+            SELECT_TRACE_WATERMAN_SMITH_BEYER_ALIGN( \
+                                           M_row[i-1][j-1], \
+                                           Ix_row[i-1][j-1], \
+                                           Iy_row[i-1][j-1], \
+                                           (align_score)); \
+            M[i][j].path = 0; \
+            if (i == nA || j == nB) { \
+                Ix_row[i][j] = score; \
+                gaps[i][j].MIx
= NULL; \ + gaps[i][j].IyIx = NULL; \ + gaps[i][j].MIy = NULL; \ + gaps[i][j].IxIy = NULL; \ + Iy_row[i][j] = score; \ + continue; \ + } \ + gapM = PyMem_Malloc((i+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gaps[i][j].MIx = gapM; \ + gapXY = PyMem_Malloc((i+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gaps[i][j].IyIx = gapXY; \ + score = -DBL_MAX; \ + for (gap = 1; gap <= i; gap++) { \ + ok = _call_query_gap_function(self, query_gap_start, gap, &gapscore); \ + if (!ok) goto exit; \ + SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i-gap][j], \ + Iy_row[i-gap][j]); \ + } \ + if (score < epsilon) { \ + score = -DBL_MAX; \ + nm = 0; \ + ng = 0; \ + } \ + else if (score > maximum) maximum = score; \ + gapM[nm] = 0; \ + gapXY[ng] = 0; \ + Ix_row[i][j] = score; \ + M[i][j].path = 0; \ + gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gaps[i][j].MIx = gapM; \ + gapM[nm] = 0; \ + gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gaps[i][j].IyIx = gapXY; \ + gapXY[ng] = 0; \ + gapM = PyMem_Malloc((j+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gaps[i][j].MIy = gapM; \ + gapXY = PyMem_Malloc((j+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gaps[i][j].IxIy = gapXY; \ + nm = 0; \ + ng = 0; \ + score = -DBL_MAX; \ + gapM[0] = 0; \ + for (gap = 1; gap <= j; gap++) { \ + ok = _call_target_gap_function(self, i, gap, &gapscore); \ + if (!ok) goto exit; \ + SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i][j-gap], \ + Ix_row[i][j-gap]); \ + } \ + if (score < epsilon) { \ + score = -DBL_MAX; \ + nm = 0; \ + ng = 0; \ + } \ + else if (score > maximum) maximum = score; \ + gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gaps[i][j].MIy = gapM; \ + gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gaps[i][j].IxIy = gapXY; \ + gapM[nm] = 0; \ + gapXY[ng] = 0; \ + Iy_row[i][j] = score; \ + M[i][j].path = 0; \ + } \ + } \ + for (i = 0; i <= nA; i++) PyMem_Free(M_row[i]); \ + PyMem_Free(M_row); \ + for (i = 0; i <= nA; i++) PyMem_Free(Ix_row[i]); \ + PyMem_Free(Ix_row); \ + for (i = 0; i <= nA; i++) PyMem_Free(Iy_row[i]); \ + PyMem_Free(Iy_row); \ +\ + /* As we don't allow zero-score extensions to alignments, \ + * we need to remove all traces towards an ENDPOINT. \ + * In addition, some points then won't have any path to a STARTPOINT. \ + * Here, use path as a temporary variable to indicate if the point \ + * is reachable from a STARTPOINT. If it is unreachable, remove all \ + * traces from it, and don't allow it to be an ENDPOINT. It may still \ + * be a valid STARTPOINT. */ \ + for (j = 0; j <= nB; j++) M[0][j].path = M_MATRIX; \ + for (i = 1; i <= nA; i++) { \ + M[i][0].path = M_MATRIX; \ + for (j = 1; j <= nB; j++) { \ + /* Remove traces to unreachable points. */ \ + trace = M[i][j].trace; \ + if (!(M[i-1][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \ + if (!(M[i-1][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \ + if (!(M[i-1][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \ + if (trace & (STARTPOINT | M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \ + /* The point is reachable. */ \ + if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \ + else M[i][j].path |= M_MATRIX; \ + } \ + else { \ + /* The point is not reachable. Then it is not a STARTPOINT, \ + * all traces from it can be removed, and it cannot act as \ + * an ENDPOINT. 
*/ \ + M[i][j].path &= ~M_MATRIX; \ + trace = 0; \ + } \ + M[i][j].trace = trace; \ + if (i == nA || j == nB) continue; \ + gapM = gaps[i][j].MIx; \ + gapXY = gaps[i][j].IyIx; \ + nm = 0; \ + ng = 0; \ + for (im = 0; (gap = gapM[im]); im++) \ + if (M[i-gap][j].path & M_MATRIX) gapM[nm++] = gap; \ + gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gapM[nm] = 0; \ + gaps[i][j].MIx = gapM; \ + for (im = 0; (gap = gapXY[im]); im++) \ + if (M[i-gap][j].path & Iy_MATRIX) gapXY[ng++] = gap; \ + gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gapXY[ng] = 0; \ + gaps[i][j].IyIx = gapXY; \ + if (nm==0 && ng==0) M[i][j].path &= ~Ix_MATRIX; /* not reachable */ \ + else M[i][j].path |= Ix_MATRIX; /* reachable */ \ + gapM = gaps[i][j].MIy; \ + gapXY = gaps[i][j].IxIy; \ + nm = 0; \ + ng = 0; \ + for (im = 0; (gap = gapM[im]); im++) \ + if (M[i][j-gap].path & M_MATRIX) gapM[nm++] = gap; \ + gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \ + if (!gapM) goto exit; \ + gapM[nm] = 0; \ + gaps[i][j].MIy = gapM; \ + for (im = 0; (gap = gapXY[im]); im++) \ + if (M[i][j-gap].path & Ix_MATRIX) gapXY[ng++] = gap; \ + gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \ + if (!gapXY) goto exit; \ + gapXY[ng] = 0; \ + gaps[i][j].IxIy = gapXY; \ + if (nm==0 && ng==0) M[i][j].path &= ~Iy_MATRIX; /* not reachable */ \ + else M[i][j].path |= Iy_MATRIX; /* reachable */ \ + } \ + } \ + /* traceback */ \ + if (maximum == 0) M[0][0].path = DONE; \ + else M[0][0].path = 0; \ + return Py_BuildValue("fN", maximum, paths); \ + + +#define WATERMANSMITHBEYER_EXIT_ALIGN \ +exit: \ + if (ok) /* otherwise, an exception was already set */ \ + PyErr_SetNone(PyExc_MemoryError); \ + Py_DECREF(paths); \ + if (M_row) { \ + /* If M is NULL, then Ix is also NULL. */ \ + if (Ix_row) { \ + /* If Ix is NULL, then Iy is also NULL. */ \ + if (Iy_row) { \ + /* If Iy is NULL, then M[i], Ix[i], and Iy[i] are also NULL. 
*/ \ + for (i = 0; i <= nA; i++) { \ + if (!M_row[i]) break; \ + PyMem_Free(M_row[i]); \ + if (!Ix_row[i]) break; \ + PyMem_Free(Ix_row[i]); \ + if (!Iy_row[i]) break; \ + PyMem_Free(Iy_row[i]); \ + } \ + PyMem_Free(Iy_row); \ + } \ + PyMem_Free(Ix_row); \ + } \ + PyMem_Free(M_row); \ + } \ + return NULL; \ + + +/* -------------- allocation & deallocation ------------- */ + +static PathGenerator* +PathGenerator_create_NWSW(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand) +{ + int i; + unsigned char trace = 0; + Trace** M; + PathGenerator* paths; + + paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0); + if (!paths) return NULL; + + paths->iA = 0; + paths->iB = 0; + paths->nA = nA; + paths->nB = nB; + paths->M = NULL; + paths->gaps.gotoh = NULL; + paths->gaps.waterman_smith_beyer = NULL; + paths->algorithm = NeedlemanWunschSmithWaterman; + paths->mode = mode; + paths->length = 0; + paths->strand = strand; + + M = PyMem_Malloc((nA+1)*sizeof(Trace*)); + paths->M = M; + if (!M) goto exit; + switch (mode) { + case Global: trace = VERTICAL; break; + case Local: trace = STARTPOINT; break; + } + for (i = 0; i <= nA; i++) { + M[i] = PyMem_Malloc((nB+1)*sizeof(Trace)); + if (!M[i]) goto exit; + M[i][0].trace = trace; + } + if (mode == Global) { + M[0][0].trace = 0; + trace = HORIZONTAL; + } + for (i = 1; i <= nB; i++) M[0][i].trace = trace; + M[0][0].path = 0; + return paths; +exit: + Py_DECREF(paths); + PyErr_SetNone(PyExc_MemoryError); + return NULL; +} + +static PathGenerator* +PathGenerator_create_Gotoh(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand) +{ + int i; + unsigned char trace; + Trace** M; + TraceGapsGotoh** gaps; + PathGenerator* paths; + + switch (mode) { + case Global: trace = 0; break; + case Local: trace = STARTPOINT; break; + default: + /* Should not happen, but the compiler has no way of knowing that, + * as the enum Mode does not restrict the value of mode, which can + * be any integer. Include default: here to prevent compiler + * warnings. 
+ */ + PyErr_Format(PyExc_RuntimeError, + "mode has unexpected value %d", mode); + return NULL; + } + + paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0); + if (!paths) return NULL; + + paths->iA = 0; + paths->iB = 0; + paths->nA = nA; + paths->nB = nB; + paths->M = NULL; + paths->gaps.gotoh = NULL; + paths->algorithm = Gotoh; + paths->mode = mode; + paths->length = 0; + paths->strand = strand; + + M = PyMem_Malloc((nA+1)*sizeof(Trace*)); + if (!M) goto exit; + paths->M = M; + for (i = 0; i <= nA; i++) { + M[i] = PyMem_Malloc((nB+1)*sizeof(Trace)); + if (!M[i]) goto exit; + M[i][0].trace = trace; + } + gaps = PyMem_Malloc((nA+1)*sizeof(TraceGapsGotoh*)); + if (!gaps) goto exit; + paths->gaps.gotoh = gaps; + for (i = 0; i <= nA; i++) { + gaps[i] = PyMem_Malloc((nB+1)*sizeof(TraceGapsGotoh)); + if (!gaps[i]) goto exit; + } + + gaps[0][0].Ix = 0; + gaps[0][0].Iy = 0; + if (mode == Global) { + for (i = 1; i <= nA; i++) { + gaps[i][0].Ix = Ix_MATRIX; + gaps[i][0].Iy = 0; + } + gaps[1][0].Ix = M_MATRIX; + for (i = 1; i <= nB; i++) { + M[0][i].trace = 0; + gaps[0][i].Ix = 0; + gaps[0][i].Iy = Iy_MATRIX; + } + gaps[0][1].Iy = M_MATRIX; + } + else if (mode == Local) { + for (i = 1; i < nA; i++) { + gaps[i][0].Ix = 0; + gaps[i][0].Iy = 0; + } + for (i = 1; i <= nB; i++) { + M[0][i].trace = trace; + gaps[0][i].Ix = 0; + gaps[0][i].Iy = 0; + } + } + M[0][0].path = 0; + + return paths; +exit: + Py_DECREF(paths); + PyErr_SetNone(PyExc_MemoryError); + return NULL; +} + +static PathGenerator* +PathGenerator_create_WSB(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand) +{ + int i, j; + int* trace; + Trace** M = NULL; + TraceGapsWatermanSmithBeyer** gaps = NULL; + PathGenerator* paths; + + paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0); + if (!paths) return NULL; + + paths->iA = 0; + paths->iB = 0; + paths->nA = nA; + paths->nB = nB; + paths->M = NULL; + paths->gaps.waterman_smith_beyer = NULL; + paths->algorithm = WatermanSmithBeyer; + paths->mode = mode; + paths->length = 0; + paths->strand = strand; + + M = PyMem_Malloc((nA+1)*sizeof(Trace*)); + if (!M) goto exit; + paths->M = M; + for (i = 0; i <= nA; i++) { + M[i] = PyMem_Malloc((nB+1)*sizeof(Trace)); + if (!M[i]) goto exit; + } + gaps = PyMem_Malloc((nA+1)*sizeof(TraceGapsWatermanSmithBeyer*)); + if (!gaps) goto exit; + paths->gaps.waterman_smith_beyer = gaps; + for (i = 0; i <= nA; i++) gaps[i] = NULL; + for (i = 0; i <= nA; i++) { + gaps[i] = PyMem_Malloc((nB+1)*sizeof(TraceGapsWatermanSmithBeyer)); + if (!gaps[i]) goto exit; + for (j = 0; j <= nB; j++) { + gaps[i][j].MIx = NULL; + gaps[i][j].IyIx = NULL; + gaps[i][j].MIy = NULL; + gaps[i][j].IxIy = NULL; + } + M[i][0].path = 0; + switch (mode) { + case Global: + M[i][0].trace = 0; + trace = PyMem_Malloc(2*sizeof(int)); + if (!trace) goto exit; + gaps[i][0].MIx = trace; + trace[0] = i; + trace[1] = 0; + trace = PyMem_Malloc(sizeof(int)); + if (!trace) goto exit; + gaps[i][0].IyIx = trace; + trace[0] = 0; + break; + case Local: + M[i][0].trace = STARTPOINT; + break; + } + } + for (i = 1; i <= nB; i++) { + switch (mode) { + case Global: + M[0][i].trace = 0; + trace = PyMem_Malloc(2*sizeof(int)); + if (!trace) goto exit; + gaps[0][i].MIy = trace; + trace[0] = i; + trace[1] = 0; + trace = PyMem_Malloc(sizeof(int)); + if (!trace) goto exit; + gaps[0][i].IxIy = trace; + trace[0] = 0; + break; + case Local: + M[0][i].trace = STARTPOINT; + break; + } + } + M[0][0].path = 0; + return paths; +exit: + Py_DECREF(paths); + PyErr_SetNone(PyExc_MemoryError); + 
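+    /* Py_DECREF(paths) above hands cleanup to the PathGenerator
+     * deallocator, which releases whichever M and gap rows were
+     * successfully allocated before the failing PyMem_Malloc call. */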
return NULL; +} + +/* ----------------- alignment algorithms ----------------- */ + +#define MATRIX_SCORE scores[kA*n+kB] +#define COMPARE_SCORE (kA == wildcard || kB == wildcard) ? 0 : (kA == kB) ? match : mismatch + + +static PyObject* +Aligner_needlemanwunsch_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + NEEDLEMANWUNSCH_SCORE(COMPARE_SCORE); +} + +static PyObject* +Aligner_needlemanwunsch_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + NEEDLEMANWUNSCH_SCORE(MATRIX_SCORE); +} + +static PyObject* +Aligner_smithwaterman_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + SMITHWATERMAN_SCORE(COMPARE_SCORE); +} + +static PyObject* +Aligner_smithwaterman_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + SMITHWATERMAN_SCORE(MATRIX_SCORE); +} + +static PyObject* +Aligner_needlemanwunsch_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + NEEDLEMANWUNSCH_ALIGN(COMPARE_SCORE); +} + +static PyObject* +Aligner_needlemanwunsch_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + NEEDLEMANWUNSCH_ALIGN(MATRIX_SCORE); +} + +static PyObject* +Aligner_smithwaterman_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + SMITHWATERMAN_ALIGN(COMPARE_SCORE); +} + +static PyObject* +Aligner_smithwaterman_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + SMITHWATERMAN_ALIGN(MATRIX_SCORE); +} + +static PyObject* +Aligner_gotoh_global_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + GOTOH_GLOBAL_SCORE(COMPARE_SCORE); +} + +static PyObject* +Aligner_gotoh_global_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + GOTOH_GLOBAL_SCORE(MATRIX_SCORE); +} + +static PyObject* +Aligner_gotoh_local_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB) +{ + const double match = self->match; + const 
double mismatch = self->mismatch; + const int wildcard = self->wildcard; + GOTOH_LOCAL_SCORE(COMPARE_SCORE); +} + +static PyObject* +Aligner_gotoh_local_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + GOTOH_LOCAL_SCORE(MATRIX_SCORE); +} + +static PyObject* +Aligner_gotoh_global_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + GOTOH_GLOBAL_ALIGN(COMPARE_SCORE); +} + +static PyObject* +Aligner_gotoh_global_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + GOTOH_GLOBAL_ALIGN(MATRIX_SCORE); +} + +static PyObject* +Aligner_gotoh_local_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + GOTOH_LOCAL_ALIGN(COMPARE_SCORE); +} + +static PyObject* +Aligner_gotoh_local_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + GOTOH_LOCAL_ALIGN(MATRIX_SCORE); +} + +static int +_call_query_gap_function(Aligner* aligner, int i, int j, double* score) +{ + double value; + PyObject* result; + PyObject* function = aligner->query_gap_function; + if (!function) + value = aligner->query_internal_open_gap_score + + (j-1) * aligner->query_internal_extend_gap_score; + else { + result = PyObject_CallFunction(function, "ii", i, j); + if (result == NULL) return 0; + value = PyFloat_AsDouble(result); + Py_DECREF(result); + if (value == -1.0 && PyErr_Occurred()) return 0; + } + *score = value; + return 1; +} + +static int +_call_target_gap_function(Aligner* aligner, int i, int j, double* score) +{ + double value; + PyObject* result; + PyObject* function = aligner->target_gap_function; + if (!function) + value = aligner->target_internal_open_gap_score + + (j-1) * aligner->target_internal_extend_gap_score; + else { + result = PyObject_CallFunction(function, "ii", i, j); + if (result == NULL) return 0; + value = PyFloat_AsDouble(result); + Py_DECREF(result); + if (value == -1.0 && PyErr_Occurred()) return 0; + } + *score = value; + return 1; +} + +static PyObject* +Aligner_watermansmithbeyer_global_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + WATERMANSMITHBEYER_ENTER_SCORE; + switch (strand) { + case '+': { + WATERMANSMITHBEYER_GLOBAL_SCORE(COMPARE_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_GLOBAL_SCORE(COMPARE_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_SCORE; +} + +static PyObject* +Aligner_watermansmithbeyer_global_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores 
= self->substitution_matrix.buf; + WATERMANSMITHBEYER_ENTER_SCORE; + switch (strand) { + case '+': + WATERMANSMITHBEYER_GLOBAL_SCORE(MATRIX_SCORE, j); + break; + case '-': + WATERMANSMITHBEYER_GLOBAL_SCORE(MATRIX_SCORE, nB-j); + break; + } + WATERMANSMITHBEYER_EXIT_SCORE; +} + +static PyObject* +Aligner_watermansmithbeyer_local_score_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + double maximum = 0.0; + WATERMANSMITHBEYER_ENTER_SCORE; + switch (strand) { + case '+': { + WATERMANSMITHBEYER_LOCAL_SCORE(COMPARE_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_LOCAL_SCORE(COMPARE_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_SCORE; +} + +static PyObject* +Aligner_watermansmithbeyer_local_score_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + double maximum = 0.0; + WATERMANSMITHBEYER_ENTER_SCORE; + switch (strand) { + case '+': { + WATERMANSMITHBEYER_LOCAL_SCORE(MATRIX_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_LOCAL_SCORE(MATRIX_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_SCORE; +} + +static PyObject* +Aligner_watermansmithbeyer_global_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + WATERMANSMITHBEYER_ENTER_ALIGN(Global); + switch (strand) { + case '+': { + WATERMANSMITHBEYER_GLOBAL_ALIGN(COMPARE_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_GLOBAL_ALIGN(COMPARE_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_ALIGN; +} + +static PyObject* +Aligner_watermansmithbeyer_global_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + WATERMANSMITHBEYER_ENTER_ALIGN(Global); + switch (strand) { + case '+': { + WATERMANSMITHBEYER_GLOBAL_ALIGN(MATRIX_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_GLOBAL_ALIGN(MATRIX_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_ALIGN; +} + +static PyObject* +Aligner_watermansmithbeyer_local_align_compare(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const double match = self->match; + const double mismatch = self->mismatch; + const int wildcard = self->wildcard; + int im = nA; + int jm = nB; + double maximum = 0; + WATERMANSMITHBEYER_ENTER_ALIGN(Local); + switch (strand) { + case '+': { + WATERMANSMITHBEYER_LOCAL_ALIGN(COMPARE_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_LOCAL_ALIGN(COMPARE_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_ALIGN; +} + +static PyObject* +Aligner_watermansmithbeyer_local_align_matrix(Aligner* self, + const int* sA, Py_ssize_t nA, + const int* sB, Py_ssize_t nB, + unsigned char strand) +{ + const Py_ssize_t n = self->substitution_matrix.shape[0]; + const double* scores = self->substitution_matrix.buf; + int im = nA; + int jm = nB; + double maximum = 0; + WATERMANSMITHBEYER_ENTER_ALIGN(Local); + switch (strand) { + case '+': { + 
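+            /* The second macro argument is the gap position reported to a
+             * user-supplied query gap function: j on the '+' strand, and
+             * nB-j in the '-' strand case below, so positions always refer
+             * to the original orientation of the query. */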
WATERMANSMITHBEYER_LOCAL_ALIGN(MATRIX_SCORE, j); + break; + } + case '-': { + WATERMANSMITHBEYER_LOCAL_ALIGN(MATRIX_SCORE, nB-j); + break; + } + } + WATERMANSMITHBEYER_EXIT_ALIGN; +} + +static int* +convert_1bytes_to_ints(const int mapping[], Py_ssize_t n, const unsigned char s[]) +{ + unsigned char c; + Py_ssize_t i; + int index; + int* indices; + if (n == 0) { + PyErr_SetString(PyExc_ValueError, "sequence has zero length"); + return NULL; + } + indices = PyMem_Malloc(n*sizeof(int)); + if (!indices) { + PyErr_NoMemory(); + return NULL; + } + if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i]; + else { + for (i = 0; i < n; i++) { + c = s[i]; + index = mapping[(int)c]; + if (index == MISSING_LETTER) { + PyErr_SetString(PyExc_ValueError, + "sequence contains letters not in the alphabet"); + PyMem_Free(indices); + return NULL; + } + indices[i] = index; + } + } + return indices; +} + +static int* +convert_2bytes_to_ints(const int mapping[], Py_ssize_t n, const Py_UCS2 s[]) +{ + unsigned char c; + Py_ssize_t i; + int index; + int* indices; + if (n == 0) { + PyErr_SetString(PyExc_ValueError, "sequence has zero length"); + return NULL; + } + indices = PyMem_Malloc(n*sizeof(int)); + if (!indices) { + PyErr_NoMemory(); + return NULL; + } + if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i]; + else { + for (i = 0; i < n; i++) { + c = s[i]; + index = mapping[(int)c]; + if (index == MISSING_LETTER) { + PyErr_SetString(PyExc_ValueError, + "sequence contains letters not in the alphabet"); + PyMem_Free(indices); + return NULL; + } + indices[i] = index; + } + } + return indices; +} + +static int* +convert_4bytes_to_ints(const int mapping[], Py_ssize_t n, const Py_UCS4 s[]) +{ + unsigned char c; + Py_ssize_t i; + int index; + int* indices; + if (n == 0) { + PyErr_SetString(PyExc_ValueError, "sequence has zero length"); + return NULL; + } + indices = PyMem_Malloc(n*sizeof(int)); + if (!indices) { + PyErr_NoMemory(); + return NULL; + } + if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i]; + else { + for (i = 0; i < n; i++) { + c = s[i]; + index = mapping[(int)c]; + if (index == MISSING_LETTER) { + PyErr_SetString(PyExc_ValueError, + "sequence contains letters not in the alphabet"); + PyMem_Free(indices); + return NULL; + } + indices[i] = index; + } + } + return indices; +} + +static int +convert_objects_to_ints(Py_buffer* view, PyObject* alphabet, PyObject* sequence) +{ + Py_ssize_t i, j; + Py_ssize_t n; + Py_ssize_t m; + int* indices = NULL; + PyObject *obj1, *obj2; + int equal; + + view->buf = NULL; + sequence = PySequence_Fast(sequence, + "argument should support the sequence protocol"); + if (!sequence) return 0; + if (!alphabet) { + PyErr_SetString(PyExc_ValueError, + "alphabet is None; cannot interpret sequence"); + goto exit; + } + alphabet = PySequence_Fast(alphabet, NULL); /* should never fail */ + n = PySequence_Size(sequence); + m = PySequence_Size(alphabet); + indices = PyMem_Malloc(n*sizeof(int)); + if (!indices) { + PyErr_NoMemory(); + goto exit; + } + for (i = 0; i < n; i++) { + obj1 = PySequence_Fast_GET_ITEM(sequence, i); + for (j = 0; j < m; j++) { + obj2 = PySequence_Fast_GET_ITEM(alphabet, j); + equal = PyObject_RichCompareBool(obj1, obj2, Py_EQ); + if (equal == 1) /* obj1 == obj2 */ { + indices[i] = j; + break; + } + else if (equal == -1) /* error */ { + PyMem_Del(indices); + goto exit; + } + /* else (equal == 0) continue; */ /* not equal */ + } + if (j == m) { + PyErr_SetString(PyExc_ValueError, "failed to find object in alphabet"); + goto exit; + } + } + view->buf = 
indices; + view->itemsize = 1; + view->len = n; +exit: + Py_DECREF(sequence); + Py_XDECREF(alphabet); + if (view->buf) return 1; + return 0; +} + +static int +sequence_converter(PyObject* argument, void* pointer) +{ + Py_buffer* view = pointer; + Py_ssize_t i; + Py_ssize_t n; + int index; + int* indices; + const int flag = PyBUF_FORMAT | PyBUF_C_CONTIGUOUS; + Aligner* aligner; + int* mapping; + + if (argument == NULL) { + if (view->obj) PyBuffer_Release(view); + else { + indices = view->buf; + PyMem_Free(indices); + } + return 1; + } + + aligner = (Aligner*)view->obj; + view->obj = NULL; + + if (PyObject_GetBuffer(argument, view, flag) == 0) { + if (view->ndim != 1) { + PyErr_Format(PyExc_ValueError, + "sequence has incorrect rank (%d expected 1)", view->ndim); + return 0; + } + n = view->len / view->itemsize; + if (n == 0) { + PyErr_SetString(PyExc_ValueError, "sequence has zero length"); + return 0; + } + if (strcmp(view->format, "c") == 0 || strcmp(view->format, "B") == 0) { + if (view->itemsize != sizeof(char)) { + PyErr_Format(PyExc_ValueError, + "sequence has unexpected item byte size " + "(%ld, expected %ld)", view->itemsize, sizeof(char)); + return 0; + } + indices = convert_1bytes_to_ints(aligner->mapping, n, view->buf); + if (!indices) return 0; + PyBuffer_Release(view); + view->itemsize = 1; + view->len = n; + view->buf = indices; + return Py_CLEANUP_SUPPORTED; + } + if (strcmp(view->format, "i") == 0 || strcmp(view->format, "l") == 0) { + if (view->itemsize != sizeof(int)) { + PyErr_Format(PyExc_ValueError, + "sequence has unexpected item byte size " + "(%ld, expected %ld)", view->itemsize, sizeof(int)); + return 0; + } + indices = view->buf; + if (aligner->substitution_matrix.obj) { + const Py_ssize_t m = aligner->substitution_matrix.shape[0]; + for (i = 0; i < n; i++) { + index = indices[i]; + if (index < 0) { + PyErr_Format(PyExc_ValueError, + "sequence item %zd is negative (%d)", + i, index); + return 0; + } + if (index >= m) { + PyErr_Format(PyExc_ValueError, + "sequence item %zd is out of bound" + " (%d, should be < %zd)", i, index, m); + return 0; + } + } + } + return Py_CLEANUP_SUPPORTED; + } + PyErr_Format(PyExc_ValueError, + "sequence has incorrect data type '%s'", view->format); + return 0; + } + PyErr_Clear(); /* To clear the exception raised by PyObject_GetBuffer */ + mapping = aligner->mapping; + if (PyUnicode_Check(argument)) { + if (PyUnicode_READY(argument) == -1) return 0; + n = PyUnicode_GET_LENGTH(argument); + switch (PyUnicode_KIND(argument)) { + case PyUnicode_1BYTE_KIND: { + Py_UCS1* s = PyUnicode_1BYTE_DATA(argument); + indices = convert_1bytes_to_ints(mapping, n, (unsigned char*)s); + break; + } + case PyUnicode_2BYTE_KIND: { + Py_UCS2* s = PyUnicode_2BYTE_DATA(argument); + indices = convert_2bytes_to_ints(mapping, n, s); + break; + } + case PyUnicode_4BYTE_KIND: { + Py_UCS4* s = PyUnicode_4BYTE_DATA(argument); + indices = convert_4bytes_to_ints(mapping, n, s); + break; + } + case PyUnicode_WCHAR_KIND: + default: + PyErr_SetString(PyExc_ValueError, "could not interpret unicode data"); + return 0; + } + if (!indices) return 0; + view->buf = indices; + view->itemsize = 1; + view->len = n; + return Py_CLEANUP_SUPPORTED; + } + + if (!mapping) { + if (!convert_objects_to_ints(view, aligner->alphabet, argument)) return 0; + return Py_CLEANUP_SUPPORTED; + } + + PyErr_SetString(PyExc_ValueError, "sequence has unexpected format"); + return 0; +} + +static int +strand_converter(PyObject* argument, void* pointer) +{ + if (!PyUnicode_Check(argument)) goto error; + 
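+    /* A valid strand is a str of length one holding an ASCII '+' or '-';
+     * anything else falls through to the ValueError set below. */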
if (PyUnicode_READY(argument) == -1) return 0; + if (PyUnicode_GET_LENGTH(argument) == 1) { + const Py_UCS4 ch = PyUnicode_READ_CHAR(argument, 0); + if (ch < 128) { + const char c = ch; + if (ch == '+' || ch == '-') { + *((char*)pointer) = c; + return 1; + } + } + } +error: + PyErr_SetString(PyExc_ValueError, "strand must be '+' or '-'"); + return 0; +} + +static const char Aligner_score__doc__[] = "calculates the alignment score"; + +static PyObject* +Aligner_score(Aligner* self, PyObject* args, PyObject* keywords) +{ + const int* sA; + const int* sB; + Py_ssize_t nA; + Py_ssize_t nB; + Py_buffer bA = {0}; + Py_buffer bB = {0}; + const Mode mode = self->mode; + const Algorithm algorithm = _get_algorithm(self); + char strand = '+'; + PyObject* result = NULL; + PyObject* substitution_matrix = self->substitution_matrix.obj; + + static char *kwlist[] = {"sequenceA", "sequenceB", "strand", NULL}; + + bA.obj = (PyObject*)self; + bB.obj = (PyObject*)self; + if(!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&", kwlist, + sequence_converter, &bA, + sequence_converter, &bB, + strand_converter, &strand)) + return NULL; + + sA = bA.buf; + nA = bA.len / bA.itemsize; + sB = bB.buf; + nB = bB.len / bB.itemsize; + + switch (algorithm) { + case NeedlemanWunschSmithWaterman: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_needlemanwunsch_score_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_needlemanwunsch_score_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_smithwaterman_score_matrix(self, sA, nA, sB, nB); + else + result = Aligner_smithwaterman_score_compare(self, sA, nA, sB, nB); + break; + } + break; + case Gotoh: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_gotoh_global_score_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_gotoh_global_score_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_gotoh_local_score_matrix(self, sA, nA, sB, nB); + else + result = Aligner_gotoh_local_score_compare(self, sA, nA, sB, nB); + break; + } + break; + case WatermanSmithBeyer: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_watermansmithbeyer_global_score_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_watermansmithbeyer_global_score_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_watermansmithbeyer_local_score_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_watermansmithbeyer_local_score_compare(self, sA, nA, sB, nB, strand); + break; + } + break; + case Unknown: + default: + PyErr_SetString(PyExc_RuntimeError, "unknown algorithm"); + break; + } + + sequence_converter(NULL, &bA); + sequence_converter(NULL, &bB); + + return result; +} + +static const char Aligner_align__doc__[] = "align two sequences"; + +static PyObject* +Aligner_align(Aligner* self, PyObject* args, PyObject* keywords) +{ + const int* sA; + const int* sB; + Py_ssize_t nA; + Py_ssize_t nB; + Py_buffer bA = {0}; + Py_buffer bB = {0}; + const Mode mode = self->mode; + const Algorithm algorithm = _get_algorithm(self); + char strand = '+'; + PyObject* result = NULL; + PyObject* substitution_matrix = self->substitution_matrix.obj; + + static char *kwlist[] = {"sequenceA", "sequenceB", "strand", NULL}; + + bA.obj = (PyObject*)self; + bB.obj = (PyObject*)self; + if(!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&", 
kwlist, + sequence_converter, &bA, + sequence_converter, &bB, + strand_converter, &strand)) + return NULL; + + sA = bA.buf; + nA = bA.len / bA.itemsize; + sB = bB.buf; + nB = bB.len / bB.itemsize; + + switch (algorithm) { + case NeedlemanWunschSmithWaterman: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_needlemanwunsch_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_needlemanwunsch_align_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_smithwaterman_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_smithwaterman_align_compare(self, sA, nA, sB, nB, strand); + break; + } + break; + case Gotoh: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_gotoh_global_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_gotoh_global_align_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_gotoh_local_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_gotoh_local_align_compare(self, sA, nA, sB, nB, strand); + break; + } + break; + case WatermanSmithBeyer: + switch (mode) { + case Global: + if (substitution_matrix) + result = Aligner_watermansmithbeyer_global_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_watermansmithbeyer_global_align_compare(self, sA, nA, sB, nB, strand); + break; + case Local: + if (substitution_matrix) + result = Aligner_watermansmithbeyer_local_align_matrix(self, sA, nA, sB, nB, strand); + else + result = Aligner_watermansmithbeyer_local_align_compare(self, sA, nA, sB, nB, strand); + break; + } + break; + case Unknown: + default: + PyErr_SetString(PyExc_RuntimeError, "unknown algorithm"); + break; + } + + sequence_converter(NULL, &bA); + sequence_converter(NULL, &bB); + + return result; +} + +static char Aligner_doc[] = +"Aligner.\n"; + +static PyMethodDef Aligner_methods[] = { + {"score", + (PyCFunction)Aligner_score, + METH_VARARGS | METH_KEYWORDS, + Aligner_score__doc__ + }, + {"align", + (PyCFunction)Aligner_align, + METH_VARARGS | METH_KEYWORDS, + Aligner_align__doc__ + }, + {NULL} /* Sentinel */ +}; + +static PyTypeObject AlignerType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_algorithms.PairwiseAligner", /* tp_name */ + sizeof(Aligner), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Aligner_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + (reprfunc)Aligner_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + (reprfunc)Aligner_str, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + Aligner_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Aligner_methods, /* tp_methods */ + 0, /* tp_members */ + Aligner_getset, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Aligner_init, /* tp_init */ +}; + + +/* Module definition */ + +static char _aligners__doc__[] = +"C extension module implementing pairwise alignment algorithms"; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "_aligners", + _aligners__doc__, + -1, + NULL, + NULL, + NULL, + NULL, + NULL +}; 
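+
+/* Module initialization below follows the standard CPython extension
+ * pattern: finalize the type objects with PyType_Ready, create the module
+ * from the PyModuleDef above, and publish the aligner type under the name
+ * "PairwiseAligner".  A minimal usage sketch from the Python side (the
+ * sequences are arbitrary example values; the strand must be '+' or '-'):
+ *
+ *     from _aligners import PairwiseAligner
+ *     aligner = PairwiseAligner()
+ *     score = aligner.score("ACGT", "ACGG", "+")
+ *     score, paths = aligner.align("ACGT", "ACGG", "+")
+ *
+ * score() returns a float, while align() returns the optimal score paired
+ * with a path generator over the optimal alignments.  Note that
+ * PyModule_AddObject steals a reference only on success, hence the
+ * Py_INCREF/Py_DECREF pairing in the function below. */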
+
+PyObject *
+PyInit__aligners(void)
+{
+    PyObject* module;
+    AlignerType.tp_new = PyType_GenericNew;
+
+    if (PyType_Ready(&AlignerType) < 0 || PyType_Ready(&PathGenerator_Type) < 0)
+        return NULL;
+
+    module = PyModule_Create(&moduledef);
+    if (!module) return NULL;
+
+    Py_INCREF(&AlignerType);
+    /* Reference to AlignerType will be stolen by PyModule_AddObject
+     * only if it is successful. */
+    if (PyModule_AddObject(module,
+                           "PairwiseAligner", (PyObject*) &AlignerType) < 0) {
+        Py_DECREF(&AlignerType);
+        Py_DECREF(module);
+        return NULL;
+    }
+
+    return module;
+}
diff --git a/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd b/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd
new file mode 100644
index 0000000..26d918c
Binary files /dev/null and b/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/Align/substitution_matrices/__init__.py b/code/lib/Bio/Align/substitution_matrices/__init__.py
new file mode 100644
index 0000000..5d49ac6
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/__init__.py
@@ -0,0 +1,514 @@
+# Copyright 2019 by Michiel de Hoon.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Substitution matrices."""
+
+import os
+import string
+import numpy
+
+
+class Array(numpy.ndarray):
+    """numpy array subclass indexed by integers and by letters."""
+
+    def __new__(cls, alphabet=None, dims=None, data=None, dtype=float):
+        """Create a new Array instance."""
+        if isinstance(data, dict):
+            if alphabet is not None:
+                raise ValueError("alphabet should be None if data is a dict")
+            if dims is not None:
+                raise ValueError("dims should be None if data is a dict")
+            alphabet = []
+            single_letters = True
+            for key in data:
+                if isinstance(key, str):
+                    if dims is None:
+                        dims = 1
+                    elif dims != 1:
+                        raise ValueError("inconsistent dimensions in data")
+                    alphabet.append(key)
+                elif isinstance(key, tuple):
+                    if dims is None:
+                        dims = len(key)
+                    elif dims != len(key):
+                        raise ValueError("inconsistent dimensions in data")
+                    if dims == 1:
+                        if not isinstance(key, str):
+                            raise ValueError("expected string")
+                        if len(key) > 1:
+                            single_letters = False
+                        alphabet.append(key)
+                    elif dims == 2:
+                        for letter in key:
+                            if not isinstance(letter, str):
+                                raise ValueError("expected string")
+                            if len(letter) > 1:
+                                single_letters = False
+                            alphabet.append(letter)
+                    else:
+                        raise ValueError(
+                            "data array should be 1- or 2- dimensional "
+                            "(found %d dimensions) in key" % dims
+                        )
+            alphabet = sorted(set(alphabet))
+            if single_letters:
+                alphabet = "".join(alphabet)
+            else:
+                alphabet = tuple(alphabet)
+            n = len(alphabet)
+            if dims == 1:
+                shape = (n,)
+            elif dims == 2:
+                shape = (n, n)
+            else:  # dims is None
+                raise ValueError("data is an empty dictionary")
+            obj = super().__new__(cls, shape, dtype)
+            if dims == 1:
+                for i, letter in enumerate(alphabet):
+                    obj[i] = data.get(letter, 0.0)
+            elif dims == 2:
+                for i1, letter1 in enumerate(alphabet):
+                    for i2, letter2 in enumerate(alphabet):
+                        key = (letter1, letter2)
+                        value = data.get(key, 0.0)
+                        obj[i1, i2] = value
+            obj._alphabet = alphabet
+            return obj
+        if alphabet is None:
+            alphabet = string.ascii_uppercase
+        elif not (isinstance(alphabet, str) or isinstance(alphabet, tuple)):
+            raise ValueError("alphabet should be a string or a tuple")
+        n = len(alphabet)
+        if data is None:
+            if dims is None:
+                dims = 1
+            elif dims not
in (1, 2): + raise ValueError("dims should be 1 or 2 (found %s)" % dims) + shape = (n,) * dims + else: + if dims is None: + shape = data.shape + dims = len(shape) + if dims == 1: + pass + elif dims == 2: + if shape[0] != shape[1]: + raise ValueError("data array is not square") + else: + raise ValueError( + "data array should be 1- or 2- dimensional " + "(found %d dimensions) " % dims + ) + else: + shape = (n,) * dims + if data.shape != shape: + raise ValueError( + "data shape has inconsistent shape (expected (%s), found (%s))" + % (shape, data.shape) + ) + obj = super().__new__(cls, shape, dtype) + if data is None: + obj[:] = 0.0 + else: + obj[:] = data + obj._alphabet = alphabet + return obj + + def __array_finalize__(self, obj): + if obj is None: + return + self._alphabet = getattr(obj, "_alphabet", None) + + def _convert_key(self, key): + if isinstance(key, tuple): + indices = [] + for index in key: + if isinstance(index, str): + try: + index = self._alphabet.index(index) + except ValueError: + raise IndexError("'%s'" % index) from None + indices.append(index) + key = tuple(indices) + elif isinstance(key, str): + try: + key = self._alphabet.index(key) + except ValueError: + raise IndexError("'%s'" % key) from None + return key + + def __getitem__(self, key): + key = self._convert_key(key) + value = numpy.ndarray.__getitem__(self, key) + if value.ndim == 2: + if self.ndim == 2: + if value.shape != self.shape: + raise IndexError("Requesting truncated array") + elif self.ndim == 1: + length = self.shape[0] + if value.shape[0] == length and value.shape[1] == 1: + pass + elif value.shape[0] == 1 and value.shape[1] == length: + pass + else: + raise IndexError("Requesting truncated array") + elif value.ndim == 1: + if value.shape[0] != self.shape[0]: + value._alphabet = self.alphabet[key] + return value.view(Array) + + def __setitem__(self, key, value): + key = self._convert_key(key) + numpy.ndarray.__setitem__(self, key, value) + + def __contains__(self, key): + # Follow dict definition of __contains__ + return key in self.keys() + + def __array_prepare__(self, out_arr, context=None): + # needed for numpy older than 1.13.0 + ufunc, inputs, i = context + alphabet = self.alphabet + for arg in inputs: + if isinstance(arg, Array): + if arg.alphabet != alphabet: + raise ValueError("alphabets are inconsistent") + return numpy.ndarray.__array_prepare__(self, out_arr, context) + + def __array_wrap__(self, out_arr, context=None): + if len(out_arr) == 1: + return out_arr[0] + return numpy.ndarray.__array_wrap__(self, out_arr, context) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + args = [] + alphabet = self._alphabet + for arg in inputs: + if isinstance(arg, Array): + if arg.alphabet != alphabet: + raise ValueError("alphabets are inconsistent") + args.append(arg.view(numpy.ndarray)) + else: + args.append(arg) + + outputs = kwargs.pop("out", None) + if outputs: + out_args = [] + for arg in outputs: + if isinstance(arg, Array): + if arg.alphabet != alphabet: + raise ValueError("alphabets are inconsistent") + out_args.append(arg.view(numpy.ndarray)) + else: + out_args.append(arg) + kwargs["out"] = tuple(out_args) + else: + outputs = (None,) * ufunc.nout + + raw_results = super().__array_ufunc__(ufunc, method, *args, **kwargs) + if raw_results is NotImplemented: + return NotImplemented + + if method == "at": + return + + if ufunc.nout == 1: + raw_results = (raw_results,) + + results = [] + for raw_result, output in zip(raw_results, outputs): + if raw_result.ndim == 0: + result = 
raw_result + elif output is None: + result = numpy.asarray(raw_result).view(Array) + result._alphabet = self._alphabet + else: + result = output + result._alphabet = self._alphabet + results.append(result) + + return results[0] if len(results) == 1 else results + + def __reduce__(self): + import pickle + + values = numpy.array(self) + state = pickle.dumps(values) + alphabet = self._alphabet + dims = len(self.shape) + dtype = self.dtype + arguments = (Array, alphabet, dims, None, dtype) + return (Array.__new__, arguments, state) + + def __setstate__(self, state): + import pickle + + self[:, :] = pickle.loads(state) + + def transpose(self, axes=None): + """Transpose the array.""" + other = numpy.ndarray.transpose(self, axes) + other._alphabet = self._alphabet + return other + + @property + def alphabet(self): + """Return the alphabet property.""" + return self._alphabet + + def copy(self): + """Create and return a copy of the array.""" + other = Array(alphabet=self._alphabet, data=self) + return other + + def get(self, key, value=None): + """Return the value of the key if found; return value otherwise.""" + try: + return self[key] + except IndexError: + return value + + def items(self): + """Return an iterator of (key, value) pairs in the array.""" + dims = len(self.shape) + if dims == 1: + for index, key in enumerate(self._alphabet): + value = numpy.ndarray.__getitem__(self, index) + yield key, value + elif dims == 2: + for i1, c1 in enumerate(self._alphabet): + for i2, c2 in enumerate(self._alphabet): + key = (c1, c2) + value = numpy.ndarray.__getitem__(self, (i1, i2)) + yield key, value + else: + raise RuntimeError("array has unexpected shape %s" % self.shape) + + def keys(self): + """Return a tuple with the keys associated with the array.""" + dims = len(self.shape) + alphabet = self._alphabet + if dims == 1: + return tuple(alphabet) + elif dims == 2: + return tuple((c1, c2) for c2 in alphabet for c1 in alphabet) + else: + raise RuntimeError("array has unexpected shape %s" % self.shape) + + def values(self): + """Return a tuple with the values stored in the array.""" + dims = len(self.shape) + alphabet = self._alphabet + if dims == 1: + return tuple(self) + elif dims == 2: + n1, n2 = self.shape + return tuple( + numpy.ndarray.__getitem__(self, (i1, i2)) + for i2 in range(n2) + for i1 in range(n1) + ) + else: + raise RuntimeError("array has unexpected shape %s" % self.shape) + + def update(self, E=None, **F): + """Update the array from dict/iterable E and F.""" + if E is not None: + try: + alphabet = E.keys() + except AttributeError: + for key, value in E: + self[key] = value + else: + for key in E: + self[key] = E[key] + for key in F: + self[key] = F[key] + + def select(self, alphabet): + """Subset the array by selecting the letters from the specified alphabet.""" + ii = [] + jj = [] + for i, key in enumerate(alphabet): + try: + j = self._alphabet.index(key) + except ValueError: + continue + ii.append(i) + jj.append(j) + dims = len(self.shape) + a = Array(alphabet, dims=dims) + ii = numpy.ix_(*[ii] * dims) + jj = numpy.ix_(*[jj] * dims) + a[ii] = numpy.ndarray.__getitem__(self, jj) + return a + + def _format_1D(self, fmt): + _alphabet = self._alphabet + n = len(_alphabet) + words = [None] * n + lines = [] + try: + header = self.header + except AttributeError: + pass + else: + for line in header: + line = "# %s\n" % line + lines.append(line) + maxwidth = 0 + for i, key in enumerate(_alphabet): + value = self[key] + word = fmt % value + width = len(word) + if width > maxwidth: + maxwidth = 
width + words[i] = word + fmt2 = " %" + str(maxwidth) + "s" + for letter, word in zip(_alphabet, words): + word = fmt2 % word + line = letter + word + "\n" + lines.append(line) + text = "".join(lines) + return text + + def _format_2D(self, fmt): + alphabet = self.alphabet + n = len(alphabet) + words = [[None] * n for _ in range(n)] + lines = [] + try: + header = self.header + except AttributeError: + pass + else: + for line in header: + line = "# %s\n" % line + lines.append(line) + width = max(len(c) for c in alphabet) + line = " " * width + for j, c2 in enumerate(alphabet): + maxwidth = 0 + for i, c1 in enumerate(alphabet): + key = (c1, c2) + value = self[key] + word = fmt % value + width = len(word) + if width > maxwidth: + maxwidth = width + words[i][j] = word + fmt2 = " %" + str(maxwidth) + "s" + word = fmt2 % c2 + line += word + for i, c1 in enumerate(alphabet): + word = words[i][j] + words[i][j] = fmt2 % word + line = line.rstrip() + "\n" + lines.append(line) + for letter, row in zip(alphabet, words): + line = letter + "".join(row) + "\n" + lines.append(line) + text = "".join(lines) + return text + + def __format__(self, fmt): + return self.format(fmt) + + def format(self, fmt=""): + """Return a string representation of the array. + + The argument ``fmt`` specifies the number format to be used. + By default, the number format is "%i" if the array contains integer + numbers, and "%.1f" otherwise. + + """ + if fmt == "": + if numpy.issubdtype(self.dtype, numpy.integer): + fmt = "%i" + else: + fmt = "%.1f" + n = len(self.shape) + if n == 1: + return self._format_1D(fmt) + elif n == 2: + return self._format_2D(fmt) + else: + raise RuntimeError("Array has unexpected rank %d" % n) + + def __str__(self): + return self.format() + + def __repr__(self): + text = numpy.ndarray.__repr__(self) + alphabet = self._alphabet + if isinstance(alphabet, str): + assert text.endswith(")") + text = text[:-1] + ",\n alphabet='%s')" % self._alphabet + return text + + +def read(handle, dtype=float): + """Parse the file and return an Array object.""" + try: + fp = open(handle) + lines = fp.readlines() + except TypeError: + fp = handle + try: + lines = fp.readlines() + except Exception as e: + raise e from None + finally: + fp.close() + header = [] + for i, line in enumerate(lines): + if not line.startswith("#"): + break + header.append(line[1:].strip()) + rows = [line.split() for line in lines[i:]] + if len(rows[0]) == len(rows[1]) == 2: + alphabet = [key for key, value in rows] + for key in alphabet: + if len(key) > 1: + alphabet = tuple(alphabet) + break + else: + alphabet = "".join(alphabet) + matrix = Array(alphabet=alphabet, dims=1, dtype=dtype) + matrix.update(rows) + else: + alphabet = rows.pop(0) + for key in alphabet: + if len(key) > 1: + alphabet = tuple(alphabet) + break + else: + alphabet = "".join(alphabet) + matrix = Array(alphabet=alphabet, dims=2, dtype=dtype) + for letter1, row in zip(alphabet, rows): + assert letter1 == row.pop(0) + for letter2, word in zip(alphabet, row): + matrix[letter1, letter2] = float(word) + matrix.header = header + return matrix + + +def load(name=None): + """Load and return a precalculated substitution matrix. 
+ + >>> from Bio.Align import substitution_matrices + >>> names = substitution_matrices.load() + """ + path = os.path.realpath(__file__) + directory = os.path.dirname(path) + subdirectory = os.path.join(directory, "data") + if name is None: + filenames = os.listdir(subdirectory) + return sorted(filenames) + path = os.path.join(subdirectory, name) + matrix = read(path) + return matrix diff --git a/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..23f023b Binary files /dev/null and b/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER22 b/code/lib/Bio/Align/substitution_matrices/data/BENNER22 new file mode 100644 index 0000000..49ba457 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER22 @@ -0,0 +1,27 @@ +# S.A. Benner, M.A. Cohen, G.H. Gonnet: +# "Amino acid substitution during functionally constrained divergent evolution +# of protein sequences". +# Protein Engineering 7(11): 1323-1332 (1994). +# Figure 3B. +# PMID 7700864 + A C D E F G H I K L M N P Q R S T V W Y +A 2.5 -1.2 -0.2 -0.3 -3.1 0.8 -1.6 -0.4 -1.0 -1.7 -0.8 0.0 0.8 -0.9 -1.2 1.3 1.4 0.4 -5.5 -3.5 +C -1.2 12.6 -3.7 -4.3 -0.1 -1.7 -1.5 -2.4 -3.3 -2.6 -2.5 -1.9 -3.1 -3.3 -1.6 0.3 -1.1 -1.7 0.5 0.6 +D -0.2 -3.7 4.8 3.9 -5.4 0.7 0.3 -4.0 0.2 -4.9 -3.9 2.4 -1.8 0.6 -1.0 0.1 -0.7 -3.0 -6.4 -3.0 +E -0.3 -4.3 3.9 4.6 -5.7 0.5 -0.2 -3.6 1.0 -4.4 -3.4 1.2 -1.7 1.7 -0.1 -0.5 -0.9 -2.7 -6.3 -4.0 +F -3.1 -0.1 -5.4 -5.7 7.7 -5.8 0.3 0.5 -5.1 2.2 0.7 -3.5 -3.4 -3.6 -4.3 -2.2 -2.6 -0.1 0.5 5.9 +G 0.8 -1.7 0.7 0.5 -5.8 6.2 -2.0 -3.8 -1.0 -4.9 -3.8 0.4 -1.8 -1.4 -0.7 0.6 -0.7 -2.5 -4.5 -4.8 +H -1.6 -1.5 0.3 -0.2 0.3 -2.0 6.1 -3.2 0.8 -2.1 -2.4 1.4 -0.4 2.4 1.5 -0.5 -1.1 -3.0 -2.7 3.7 +I -0.4 -2.4 -4.0 -3.6 0.5 -3.8 -3.2 4.2 -3.0 2.7 3.1 -2.7 -2.3 -2.7 -3.2 -1.4 0.3 3.6 -4.4 -2.2 +K -1.0 -3.3 0.2 1.0 -5.1 -1.0 0.8 -3.0 4.4 -3.3 -2.0 1.0 -1.6 2.2 3.9 -0.4 -0.4 -2.7 -3.7 -3.6 +L -1.7 -2.6 -4.9 -4.4 2.2 -4.9 -2.1 2.7 -3.3 4.6 3.2 -3.5 -1.3 -2.0 -2.9 -2.1 -1.0 2.0 -1.8 -0.7 +M -0.8 -2.5 -3.9 -3.4 0.7 -3.8 -2.4 3.1 -2.0 3.2 4.9 -2.6 -2.0 -1.7 -2.1 -1.5 0.1 2.5 -2.8 -1.8 +N 0.0 -1.9 2.4 1.2 -3.5 0.4 1.4 -2.7 1.0 -3.5 -2.6 3.3 -1.1 0.5 0.4 1.1 0.5 -2.3 -5.2 -1.2 +P 0.8 -3.1 -1.8 -1.7 -3.4 -1.8 -0.4 -2.3 -1.6 -1.3 -2.0 -1.1 7.0 -0.1 -1.2 1.1 0.4 -1.7 -5.8 -3.5 +Q -0.9 -3.3 0.6 1.7 -3.6 -1.4 2.4 -2.7 2.2 -2.0 -1.7 0.5 -0.1 4.2 2.2 -0.6 -0.7 -2.4 -3.3 -1.9 +R -1.2 -1.6 -1.0 -0.1 -4.3 -0.7 1.5 -3.2 3.9 -2.9 -2.1 0.4 -1.2 2.2 5.0 -0.5 -0.7 -2.9 -1.1 -2.7 +S 1.3 0.3 0.1 -0.5 -2.2 0.6 -0.5 -1.4 -0.4 -2.1 -1.5 1.1 1.1 -0.6 -0.5 2.0 1.5 -0.9 -3.9 -1.9 +T 1.4 -1.1 -0.7 -0.9 -2.6 -0.7 -1.1 0.3 -0.4 -1.0 0.1 0.5 0.4 -0.7 -0.7 1.5 2.5 0.4 -4.5 -3.0 +V 0.4 -1.7 -3.0 -2.7 -0.1 -2.5 -3.0 3.6 -2.7 2.0 2.5 -2.3 -1.7 -2.4 -2.9 -0.9 0.4 3.7 -4.5 -2.6 +W -5.5 0.5 -6.4 -6.3 0.5 -4.5 -2.7 -4.4 -3.7 -1.8 -2.8 -5.2 -5.8 -3.3 -1.1 -3.9 -4.5 -4.5 15.7 1.5 +Y -3.5 0.6 -3.0 -4.0 5.9 -4.8 3.7 -2.2 -3.6 -0.7 -1.8 -1.2 -3.5 -1.9 -2.7 -1.9 -3.0 -2.6 1.5 9.0 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER6 b/code/lib/Bio/Align/substitution_matrices/data/BENNER6 new file mode 100644 index 0000000..4849b30 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER6 @@ -0,0 +1,27 @@ +# S.A. Benner, M.A. Cohen, G.H. 
Gonnet: +# "Amino acid substitution during functionally constrained divergent evolution +# of protein sequences". +# Protein Engineering 7(11): 1323-1332 (1994). +# Figure 3A. +# PMID 7700864 + A C D E F G H I K L M N P Q R S T V W Y +A 2.5 -1.7 -0.6 -0.7 -3.2 0.8 -2.1 0.1 -1.9 -1.3 -0.2 0.0 1.1 -1.7 -1.7 1.4 1.7 0.7 -4.3 -4.0 +C -1.7 12.1 -3.7 -4.7 -0.1 -1.3 -1.2 -3.6 -2.8 -3.8 -3.7 -1.6 -2.7 -3.2 -0.4 0.9 -1.5 -3.1 1.6 2.6 +D -0.6 -3.7 5.2 4.4 -5.7 0.8 0.1 -4.2 -0.2 -5.3 -4.3 2.5 -2.8 0.6 -1.5 -0.4 -1.2 -3.3 -6.3 -2.3 +E -0.7 -4.7 4.4 5.2 -6.7 0.5 -0.2 -4.1 0.9 -5.0 -4.1 1.1 -2.6 2.1 -0.4 -1.2 -1.6 -3.0 -5.6 -4.1 +F -3.2 -0.1 -5.7 -6.7 8.3 -5.7 0.1 0.0 -6.3 2.4 -0.1 -3.5 -3.2 -4.4 -4.9 -1.8 -2.4 -0.5 -1.6 5.6 +G 0.8 -1.3 0.8 0.5 -5.7 5.8 -2.1 -3.4 -1.4 -4.6 -3.7 -0.1 -1.7 -1.6 -0.1 0.8 -0.5 -2.3 -1.7 -4.9 +H -2.1 -1.2 0.1 -0.2 0.1 -2.1 6.1 -3.7 0.9 -2.2 -3.4 1.4 -0.4 3.2 1.8 -0.9 -1.7 -3.8 -2.8 4.4 +I 0.1 -3.6 -4.2 -4.1 0.0 -3.4 -3.7 4.4 -3.8 2.4 4.0 -2.5 -2.0 -3.8 -3.8 -1.2 0.7 3.9 -5.0 -3.3 +K -1.9 -2.8 -0.2 0.9 -6.3 -1.4 0.9 -3.8 5.6 -4.1 -2.9 1.0 -2.3 2.5 4.3 -1.2 -1.1 -3.8 -1.4 -4.0 +L -1.3 -3.8 -5.3 -5.0 2.4 -4.6 -2.2 2.4 -4.1 4.8 -2.9 -3.4 -0.2 -2.4 -3.2 -1.5 -0.4 1.9 -3.0 -1.6 +M -0.2 -3.7 -4.3 -4.1 -0.1 -3.7 -3.4 4.0 -2.9 -2.9 4.8 -2.5 -1.8 -3.1 -3.0 -1.3 0.6 3.3 -4.4 -3.6 +N 0.0 -1.6 2.5 1.1 -3.5 -0.1 1.4 -2.5 1.0 -3.4 -2.5 3.6 -1.1 0.1 -0.1 1.2 0.5 -2.4 -4.4 -0.9 +P 1.1 -2.7 -2.8 -2.6 -3.2 -1.7 -0.4 -2.0 -2.3 -0.2 -1.8 -1.1 6.5 0.1 -1.3 1.4 0.6 -1.6 -4.8 -3.8 +Q -1.7 -3.2 0.6 2.1 -4.4 -1.6 3.2 -3.8 2.5 -2.4 -3.1 0.1 0.1 5.3 2.5 -1.4 -1.7 -3.5 -2.6 -1.4 +R -1.7 -0.4 -1.5 -0.4 -4.9 -0.1 1.8 -3.8 4.3 -3.2 -3.0 -0.1 -1.3 2.5 5.1 -0.9 -1.3 -3.7 2.0 -2.6 +S 1.4 0.9 -0.4 -1.2 -1.8 0.8 -0.9 -1.2 -1.2 -1.5 -1.3 1.2 1.4 -1.4 -0.9 2.1 1.5 -0.9 -2.9 -1.8 +T 1.7 -1.5 -1.2 -1.6 -2.4 -0.5 -1.7 0.7 -1.1 -0.4 0.6 0.5 0.6 -1.7 -1.3 1.5 2.4 0.6 -2.6 -3.4 +V 0.7 -3.1 -3.3 -3.0 -0.5 -2.3 -3.8 3.9 -3.8 1.9 3.3 -2.4 -1.6 -3.5 -3.7 -0.9 0.6 4.0 -4.8 -3.8 +W -4.3 1.6 -6.3 -5.6 -1.6 -1.7 -2.8 -5.0 -1.4 -3.0 -4.4 -4.4 -4.8 -2.6 2.0 -2.9 -2.6 -4.8 14.7 -0.3 +Y -4.0 2.6 -2.3 -4.1 5.6 -4.9 4.4 -3.3 -4.0 -1.6 -3.6 -0.9 -3.8 -1.4 -2.6 -1.8 -3.4 -3.8 -0.3 9.5 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER74 b/code/lib/Bio/Align/substitution_matrices/data/BENNER74 new file mode 100644 index 0000000..62000b1 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER74 @@ -0,0 +1,27 @@ +# S.A. Benner, M.A. Cohen, G.H. Gonnet: +# "Amino acid substitution during functionally constrained divergent evolution +# of protein sequences". +# Protein Engineering 7(11): 1323-1332 (1994). +# Figure 3C. 
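The `Array` class defined in `__init__.py` above doubles as a mapping keyed by letters (single letters for `dims=1`, letter pairs for `dims=2`) through its `get`/`keys`/`items`/`update` methods. A minimal sketch of that interface, assuming the vendored `code/lib` directory is on `sys.path`:

```python
from Bio.Align.substitution_matrices import Array

# A fresh Array is zero-filled and indexed by letters of its alphabet.
counts = Array("ACGT", dims=2)
counts["A", "G"] += 1.0
print(counts["A", "G"])             # 1.0

# keys() enumerates letter pairs for a 2D array.
print(sorted(counts.keys())[:2])    # [('A', 'A'), ('A', 'C')]

# get() returns its default instead of raising for letters outside
# the alphabet (lookup failures surface as IndexError internally).
print(counts.get(("A", "X"), 0.0))  # 0.0

# update() accepts a dict or an iterable of (key, value) pairs.
counts.update({("C", "T"): 2.0})
print(counts["C", "T"])             # 2.0
```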
+# PMID 7700864 + A C D E F G H I K L M N P Q R S T V W Y +A 2.4 0.3 -0.3 -0.1 -2.6 0.6 -1.0 -0.8 -0.4 -1.4 -0.8 -0.2 0.4 -0.3 -0.8 1.1 0.7 0.1 -4.1 -2.6 +C 0.3 11.8 -3.2 -3.2 -0.7 -2.0 -1.3 -1.2 -2.9 -1.6 -1.2 -1.8 -3.1 -2.6 -2.2 0.1 -0.6 -0.2 -0.9 -0.4 +D -0.3 -3.2 4.8 2.9 -4.7 0.2 0.4 -3.9 0.4 -4.2 -3.2 2.2 -1.0 0.8 -0.5 0.4 -0.2 -2.9 -5.5 -2.8 +E -0.1 -3.2 2.9 3.7 -4.3 -0.5 0.2 -2.9 1.2 -3.1 -2.2 1.0 -0.7 1.7 0.3 0.1 -0.2 -2.1 -4.7 -3.0 +F -2.6 -0.7 -4.7 -4.3 7.2 -5.4 0.0 0.9 -3.6 2.1 1.3 -3.2 -3.8 -2.8 -3.5 -2.6 -2.2 0.1 3.0 5.3 +G 0.6 -2.0 0.2 -0.5 -5.4 6.6 -1.6 -4.3 -1.1 -4.6 -3.5 0.4 -1.7 -1.1 -1.0 0.4 -1.0 -3.1 -4.1 -4.3 +H -1.0 -1.3 0.4 0.2 0.0 -1.6 6.1 -2.3 0.6 -1.9 -1.5 1.2 -1.0 1.4 1.0 -0.3 -0.5 -2.1 -1.0 2.5 +I -0.8 -1.2 -3.9 -2.9 0.9 -4.3 -2.3 4.0 -2.3 2.8 2.6 -2.8 -2.6 -2.0 -2.6 -1.8 -0.3 3.2 -2.3 -1.0 +K -0.4 -2.9 0.4 1.2 -3.6 -1.1 0.6 -2.3 3.4 -2.4 -1.5 0.9 -0.8 1.7 2.9 0.0 0.1 -1.9 -3.6 -2.4 +L -1.4 -1.6 -4.2 -3.1 2.1 -4.6 -1.9 2.8 -2.4 4.2 2.9 -3.1 -2.2 -1.7 -2.4 -2.2 -1.1 1.9 -0.9 -0.1 +M -0.8 -1.2 -3.2 -2.2 1.3 -3.5 -1.5 2.6 -1.5 2.9 4.5 -2.2 -2.4 -1.0 -1.8 -1.4 -0.4 1.8 -1.3 -0.5 +N -0.2 -1.8 2.2 1.0 -3.2 0.4 1.2 -2.8 0.9 -3.1 -2.2 3.6 -1.0 0.7 0.3 0.9 0.4 -2.2 -4.0 -1.4 +P 0.4 -3.1 -1.0 -0.7 -3.8 -1.7 -1.0 -2.6 -0.8 -2.2 -2.4 -1.0 7.5 -0.2 -0.1 0.5 0.1 -1.9 -5.2 -3.4 +Q -0.3 -2.6 0.8 1.7 -2.8 -1.1 1.4 -2.0 1.7 -1.7 -1.0 0.7 -0.2 3.0 1.6 0.1 -0.1 -1.7 -2.8 -1.8 +R -0.8 -2.2 -0.5 0.3 -3.5 -1.0 1.0 -2.6 2.9 -2.4 -1.8 0.3 -0.1 1.6 4.8 -0.2 -0.3 -2.2 -1.6 -2.0 +S 1.1 0.1 0.4 0.1 -2.6 0.4 -0.3 -1.8 0.0 -2.2 -1.4 0.9 0.5 0.1 -0.2 2.1 1.4 -1.0 -3.4 -1.9 +T 0.7 -0.6 -0.2 -0.2 -2.2 -1.0 -0.5 -0.3 0.1 -1.1 -0.4 0.4 0.1 -0.1 -0.3 1.4 2.5 0.2 -3.7 -2.1 +V 0.1 -0.2 -2.9 -2.1 0.1 -3.1 -2.1 3.2 -1.9 1.9 1.8 -2.2 -1.9 -1.7 -2.2 -1.0 0.2 3.4 -2.9 -1.4 +W -4.1 -0.9 -5.5 -4.7 3.0 -4.1 -1.0 -2.3 -3.6 -0.9 -1.3 -4.0 -5.2 -2.8 -1.6 -3.4 -3.7 -2.9 14.7 3.6 +Y -2.6 -0.4 -2.8 -3.0 5.3 -4.3 2.5 -1.0 -2.4 -0.1 -0.5 -1.4 -3.4 -1.8 -2.0 -1.9 -2.1 -1.4 3.6 8.1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45 new file mode 100644 index 0000000..18c3323 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum45.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 45 +# Entropy = 0.3795, Expected = -0.2789 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0 -5 +R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1 -5 +N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1 -5 +D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1 -5 +C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2 -5 +Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1 -5 +E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1 -5 +G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -5 +H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1 -5 +I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1 -5 +L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1 -5 +K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1 -5 +M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1 -5 +F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1 -5 +P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 
-1 -1 -5 +S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0 -5 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0 -5 +W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2 -5 +Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1 -5 +V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1 -5 +B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1 -5 +Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1 -5 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1 -5 +* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50 new file mode 100644 index 0000000..3f62e3c --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum50.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 50 +# Entropy = 0.4808, Expected = -0.3573 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -1 -1 -5 +R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 0 -1 -5 +N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 4 0 -1 -5 +D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 5 1 -1 -5 +C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2 -5 +Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 4 -1 -5 +E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 5 -1 -5 +G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -2 -2 -5 +H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 0 -1 -5 +I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 -3 -1 -5 +L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 -3 -1 -5 +K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 1 -1 -5 +M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 -1 -1 -5 +F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 -4 -2 -5 +P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2 -5 +S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 0 -1 -5 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 0 -5 +W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -3 -5 +Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -2 -1 -5 +V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -4 -3 -1 -5 +B -2 -1 4 5 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -4 5 2 -1 -5 +Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 2 5 -1 -5 +X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1 0 -3 -1 -1 -1 -1 -1 -5 +* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62 new file mode 100644 index 0000000..205f139 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum62.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 62 +# Entropy = 0.6979, Expected = -0.5209 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +N -2 0 6 1 -3 0 0 0 1 -3 
-3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80 new file mode 100644 index 0000000..78172a3 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum80_3.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 80 +# Entropy = 0.9868, Expected = -0.7442 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -8 +R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -8 +N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -8 +D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -8 +C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -8 +Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -8 +E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -8 +G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -8 +H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -8 +I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -8 +L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -8 +K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -8 +M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -8 +F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -8 +P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -8 +S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -8 +T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -8 +W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -8 +Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -8 +V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -8 +B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -8 +Z -2 0 -1 1 -7 5 6 -4 0 -6 
-5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -8 +X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -8 +* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90 new file mode 100644 index 0000000..71441b5 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum90.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 90 +# Entropy = 1.1806, Expected = -0.8887 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 5 -2 -2 -3 -1 -1 -1 0 -2 -2 -2 -1 -2 -3 -1 1 0 -4 -3 -1 -2 -1 -1 -6 +R -2 6 -1 -3 -5 1 -1 -3 0 -4 -3 2 -2 -4 -3 -1 -2 -4 -3 -3 -2 0 -2 -6 +N -2 -1 7 1 -4 0 -1 -1 0 -4 -4 0 -3 -4 -3 0 0 -5 -3 -4 4 -1 -2 -6 +D -3 -3 1 7 -5 -1 1 -2 -2 -5 -5 -1 -4 -5 -3 -1 -2 -6 -4 -5 4 0 -2 -6 +C -1 -5 -4 -5 9 -4 -6 -4 -5 -2 -2 -4 -2 -3 -4 -2 -2 -4 -4 -2 -4 -5 -3 -6 +Q -1 1 0 -1 -4 7 2 -3 1 -4 -3 1 0 -4 -2 -1 -1 -3 -3 -3 -1 4 -1 -6 +E -1 -1 -1 1 -6 2 6 -3 -1 -4 -4 0 -3 -5 -2 -1 -1 -5 -4 -3 0 4 -2 -6 +G 0 -3 -1 -2 -4 -3 -3 6 -3 -5 -5 -2 -4 -5 -3 -1 -3 -4 -5 -5 -2 -3 -2 -6 +H -2 0 0 -2 -5 1 -1 -3 8 -4 -4 -1 -3 -2 -3 -2 -2 -3 1 -4 -1 0 -2 -6 +I -2 -4 -4 -5 -2 -4 -4 -5 -4 5 1 -4 1 -1 -4 -3 -1 -4 -2 3 -5 -4 -2 -6 +L -2 -3 -4 -5 -2 -3 -4 -5 -4 1 5 -3 2 0 -4 -3 -2 -3 -2 0 -5 -4 -2 -6 +K -1 2 0 -1 -4 1 0 -2 -1 -4 -3 6 -2 -4 -2 -1 -1 -5 -3 -3 -1 1 -1 -6 +M -2 -2 -3 -4 -2 0 -3 -4 -3 1 2 -2 7 -1 -3 -2 -1 -2 -2 0 -4 -2 -1 -6 +F -3 -4 -4 -5 -3 -4 -5 -5 -2 -1 0 -4 -1 7 -4 -3 -3 0 3 -2 -4 -4 -2 -6 +P -1 -3 -3 -3 -4 -2 -2 -3 -3 -4 -4 -2 -3 -4 8 -2 -2 -5 -4 -3 -3 -2 -2 -6 +S 1 -1 0 -1 -2 -1 -1 -1 -2 -3 -3 -1 -2 -3 -2 5 1 -4 -3 -2 0 -1 -1 -6 +T 0 -2 0 -2 -2 -1 -1 -3 -2 -1 -2 -1 -1 -3 -2 1 6 -4 -2 -1 -1 -1 -1 -6 +W -4 -4 -5 -6 -4 -3 -5 -4 -3 -4 -3 -5 -2 0 -5 -4 -4 11 2 -3 -6 -4 -3 -6 +Y -3 -3 -3 -4 -4 -3 -4 -5 1 -2 -2 -3 -2 3 -4 -3 -2 2 8 -3 -4 -3 -2 -6 +V -1 -3 -4 -5 -2 -3 -3 -5 -4 3 0 -3 0 -2 -3 -2 -1 -3 -3 5 -4 -3 -2 -6 +B -2 -2 4 4 -4 -1 0 -2 -1 -5 -5 -1 -4 -4 -3 0 -1 -6 -4 -4 4 0 -2 -6 +Z -1 0 -1 0 -5 4 4 -3 0 -4 -4 1 -2 -4 -2 -1 -1 -4 -3 -3 0 4 -1 -6 +X -1 -2 -2 -2 -3 -1 -2 -2 -2 -2 -2 -1 -1 -2 -2 -1 -1 -3 -2 -2 -2 -1 -2 -6 +* -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF b/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF new file mode 100644 index 0000000..e8aecac --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF @@ -0,0 +1,27 @@ +# M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt: +# "A Model of Evolutionary Change in Proteins." +# Margaret O. Dayhoff: Atlas of Protein Sequence and Structure, +# Volume 5, Supplement 3, 1978, pages 345-352. +# The National Biomedical Research Foundation, 1979. +# Figure 84, page 352. 
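The `read()` function added earlier parses plain-text files like the one above: leading `#` lines are collected into `matrix.header`, the first non-comment row gives the alphabet, and each remaining row is a letter followed by one score per column. A short sketch, with the file path assumed to match this capsule's layout:

```python
import sys
sys.path.insert(0, "code/lib")  # assumed location of the vendored Bio package

from Bio.Align.substitution_matrices import read

dayhoff = read("code/lib/Bio/Align/substitution_matrices/data/DAYHOFF")
print(dayhoff.header[0])   # M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt:
print(dayhoff.alphabet)    # ACDEFGHIKLMNPQRSTVWY
print(dayhoff["W", "W"])   # 1.7
```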
+ A C D E F G H I K L M N P Q R S T V W Y +A 0.2 -0.2 0.0 0.0 -0.4 0.1 -0.1 -0.1 -0.1 -0.2 -0.1 0.0 0.1 0.0 -0.2 0.1 0.1 0.0 -0.6 -0.3 +C -0.2 1.2 -0.5 -0.5 -0.4 -0.3 -0.3 -0.2 -0.5 -0.6 -0.5 -0.4 -0.3 -0.5 -0.4 0.0 -0.2 -0.2 -0.8 0.0 +D 0.0 -0.5 0.4 0.3 -0.6 0.1 0.1 -0.2 0.0 -0.4 -0.3 0.2 -0.1 0.2 -0.1 0.0 0.0 -0.2 -0.7 -0.4 +E 0.0 -0.5 0.3 0.4 -0.5 0.0 0.1 -0.2 0.0 -0.3 -0.2 0.1 -0.1 0.2 -0.1 0.0 0.0 -0.2 -0.7 -0.4 +F -0.4 -0.4 -0.6 -0.5 0.9 -0.5 -0.2 0.1 -0.5 0.2 0.0 -0.4 -0.5 -0.5 -0.4 -0.3 -0.3 -0.1 0.0 0.7 +G 0.1 -0.3 0.1 0.0 -0.5 0.5 -0.2 -0.3 -0.2 -0.4 -0.3 0.0 -0.1 -0.1 -0.3 0.1 0.0 -0.1 -0.7 -0.5 +H -0.1 -0.3 0.1 0.1 -0.2 -0.2 0.6 -0.2 0.0 -0.2 -0.2 0.2 0.0 0.3 0.2 -0.1 -0.1 -0.2 -0.3 0.0 +I -0.1 -0.2 -0.2 -0.2 0.1 -0.3 -0.2 0.5 -0.2 0.2 0.2 -0.2 -0.2 -0.2 -0.2 -0.1 0.0 0.4 -0.5 -0.1 +K -0.1 -0.5 0.0 0.0 -0.5 -0.2 0.0 -0.2 0.5 -0.3 0.0 0.1 -0.1 0.1 0.3 0.0 0.0 -0.2 -0.3 -0.4 +L -0.2 -0.6 -0.4 -0.3 0.2 -0.4 -0.2 0.2 -0.3 0.6 0.4 -0.3 -0.3 -0.2 -0.3 -0.3 -0.2 0.2 -0.2 -0.1 +M -0.1 -0.5 -0.3 -0.2 0.0 -0.3 -0.2 0.2 0.0 0.4 0.6 -0.2 -0.2 -0.1 0.0 -0.2 -0.1 0.2 -0.4 -0.2 +N 0.0 -0.4 0.2 0.1 -0.4 0.0 0.2 -0.2 0.1 -0.3 -0.2 0.2 -0.1 0.1 0.0 0.1 0.0 -0.2 -0.4 -0.2 +P 0.1 -0.3 -0.1 -0.1 -0.5 -0.1 0.0 -0.2 -0.1 -0.3 -0.2 -0.1 0.6 0.0 0.0 0.1 0.0 -0.1 -0.6 -0.5 +Q 0.0 -0.5 0.2 0.2 -0.5 -0.1 0.3 -0.2 0.1 -0.2 -0.1 0.1 0.0 0.4 0.1 -0.1 -0.1 -0.2 -0.5 -0.4 +R -0.2 -0.4 -0.1 -0.1 -0.4 -0.3 0.2 -0.2 0.3 -0.3 0.0 0.0 0.0 0.1 0.6 0.0 -0.1 -0.2 0.2 -0.4 +S 0.1 0.0 0.0 0.0 -0.3 0.1 -0.1 -0.1 0.0 -0.3 -0.2 0.1 0.1 -0.1 0.0 0.2 0.1 -0.1 -0.2 -0.3 +T 0.1 -0.2 0.0 0.0 -0.3 0.0 -0.1 0.0 0.0 -0.2 -0.1 0.0 0.0 -0.1 -0.1 0.1 0.3 0.0 -0.5 -0.3 +V 0.0 -0.2 -0.2 -0.2 -0.1 -0.1 -0.2 0.4 -0.2 0.2 0.2 -0.2 -0.1 -0.2 -0.2 -0.1 0.0 0.4 -0.6 -0.2 +W -0.6 -0.8 -0.7 -0.7 0.0 -0.7 -0.3 -0.5 -0.3 -0.2 -0.4 -0.4 -0.6 -0.5 0.2 -0.2 -0.5 -0.6 1.7 0.0 +Y -0.3 0.0 -0.4 -0.4 0.7 -0.5 0.0 -0.1 -0.4 -0.1 -0.2 -0.2 -0.5 -0.4 -0.4 -0.3 -0.3 -0.2 0.0 1.0 diff --git a/code/lib/Bio/Align/substitution_matrices/data/FENG b/code/lib/Bio/Align/substitution_matrices/data/FENG new file mode 100644 index 0000000..ebd5c2d --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/FENG @@ -0,0 +1,26 @@ +# D.F. Feng, M.S. Johnson, R.F. Doolittle: +# "Aligning amino acid sequences: Comparison of commonly used methods." +# Journal of Molecular Evolution 21(2): 112-125 (1985). +# Table 1, upper triangle. 
+# PMID 6100188 + A C D E F G H I K L M N P Q R S T V W Y +A 6 2 4 4 2 5 2 2 3 2 2 3 5 3 2 5 5 5 2 2 +C 2 6 1 0 3 3 2 2 0 2 2 2 2 1 2 4 2 2 3 3 +D 4 1 6 5 1 4 3 1 3 1 0 5 2 4 2 3 2 3 0 2 +E 4 0 5 6 0 4 2 1 4 1 1 3 3 4 2 3 3 4 1 1 +F 2 3 1 0 6 1 2 4 0 4 2 1 2 1 1 3 1 4 3 5 +G 5 3 4 4 1 6 1 2 2 2 1 3 3 2 3 5 2 4 3 2 +H 2 2 3 2 2 1 6 1 3 3 1 4 3 4 4 3 2 1 1 3 +I 2 2 1 1 4 2 1 6 2 5 4 2 2 1 2 2 3 5 2 3 +K 3 0 3 4 0 2 3 2 6 2 2 4 2 4 5 3 4 3 1 1 +L 2 2 1 1 4 2 3 5 2 6 5 1 3 2 2 2 2 5 4 3 +M 2 2 0 1 2 1 1 4 2 5 6 1 2 2 2 1 3 4 3 2 +N 3 2 5 3 1 3 4 2 4 1 1 6 2 3 2 5 4 2 0 3 +P 5 2 2 3 2 3 3 2 2 3 2 2 6 3 3 4 4 3 2 2 +Q 3 1 4 4 1 2 4 1 4 2 2 3 3 6 3 3 3 2 1 2 +R 2 2 2 2 1 3 4 2 5 2 2 2 3 3 6 3 3 2 2 1 +S 5 4 3 3 3 5 3 2 3 2 1 5 4 3 3 6 5 2 2 3 +T 5 2 2 3 1 2 2 3 4 2 3 4 4 3 3 5 6 3 1 2 +V 5 2 3 4 4 4 1 5 3 5 4 2 3 2 2 2 3 6 3 3 +W 2 3 0 1 3 3 1 2 1 4 3 0 2 1 2 2 1 3 6 3 +Y 2 3 2 1 5 2 3 3 1 3 2 3 2 2 1 3 2 3 3 6 diff --git a/code/lib/Bio/Align/substitution_matrices/data/GENETIC b/code/lib/Bio/Align/substitution_matrices/data/GENETIC new file mode 100644 index 0000000..79fc69b --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/GENETIC @@ -0,0 +1,27 @@ +# S.A. Benner, M.A. Cohen, G.H. Gonnet: +# "Amino acid substitution during functionally constrained divergent evolution +# of protein sequences." +# Figure 5. +# Protein Engineering 7(11): 1323-1332 (1994). +# PMID 7700864 + A C D E F G H I K L M N P Q R S T V W Y +A 4.0 -1.9 1.0 1.3 -2.4 1.2 -2.1 -1.8 -1.9 -2.3 -2.0 -1.7 0.8 -2.1 -1.6 0.1 0.9 1.0 -2.2 -2.4 +C -1.9 5.5 -1.6 -3.0 1.8 1.0 -1.6 -1.9 -3.2 -1.3 -2.7 -1.5 -1.9 -3.1 0.7 1.5 -1.9 -2.2 4.1 2.6 +D 1.0 -1.6 4.8 3.8 -1.7 1.1 1.7 -2.1 0.3 -2.4 -2.5 1.7 -2.2 0.3 -2.3 -2.1 -2.1 1.0 -2.9 2.3 +E 1.3 -3.0 3.8 5.7 -2.9 1.4 0.3 -2.3 2.0 -2.5 -1.8 0.3 -2.1 2.0 -2.0 -2.8 -2.1 1.3 -3.2 -0.9 +F -2.4 1.8 -1.7 -2.9 4.5 -1.9 -1.1 1.3 -2.8 2.2 0.5 -1.3 -1.8 -2.1 -1.5 0.0 -2.1 1.0 0.0 2.0 +G 1.2 1.0 1.1 1.4 -1.9 4.2 -2.2 -2.5 -2.2 -2.2 -2.3 -2.6 -1.8 -2.1 0.8 -0.6 -2.1 1.1 1.4 -1.8 +H -2.1 -1.6 1.7 0.3 -1.1 -2.2 4.7 -1.8 0.6 -0.1 -1.8 1.8 0.7 3.6 3.6 -1.6 -1.8 -2.1 -2.1 2.3 +I -1.8 -1.9 -2.1 -2.3 1.3 -2.5 -1.8 4.1 0.7 1.2 3.3 0.9 -1.6 -1.9 -1.2 -0.5 0.8 1.0 -2.2 -1.6 +K -1.9 -3.2 0.3 2.0 -2.8 -2.2 0.6 0.7 5.6 -2.0 1.6 3.5 -1.5 2.2 -0.2 -1.5 1.0 -2.1 -3.0 -0.8 +L -2.3 -1.3 -2.4 -2.5 2.2 -2.2 -0.1 1.2 -2.0 3.4 1.5 -2.2 0.0 0.1 -0.4 -1.2 -1.9 1.1 -0.3 -1.6 +M -2.0 -2.7 -2.5 -1.8 0.5 -2.3 -1.8 3.3 1.6 1.5 5.4 0.1 -1.4 -1.2 -0.4 -1.3 0.7 1.0 -2.0 -2.9 +N -1.7 -1.5 1.7 0.3 -1.3 -2.6 1.8 0.9 3.5 -2.2 0.1 4.7 -1.6 0.4 -1.5 -0.3 0.9 -2.2 -3.0 2.5 +P 0.8 -1.9 -2.2 -2.1 -1.8 -1.8 0.7 -1.6 -1.5 0.0 -1.4 -1.6 3.8 1.0 0.3 0.4 1.1 -2.1 -1.6 -2.3 +Q -2.1 -3.1 0.3 2.0 -2.1 -2.1 3.6 -1.9 2.2 0.1 -1.2 0.4 1.0 5.5 0.3 -2.3 -1.7 -2.0 -2.3 -0.8 +R -1.6 0.7 -2.3 -2.0 -1.5 0.8 3.6 -1.2 -0.2 -0.4 -0.4 -1.5 0.3 0.3 2.9 0.3 -0.6 -2.1 1.8 -1.9 +S 0.1 1.5 -2.1 -2.8 0.0 -0.6 -1.6 -0.5 -1.5 -1.2 -1.3 -0.3 0.4 -2.3 0.3 2.6 1.0 -2.2 0.8 0.3 +T 0.9 -1.9 -2.1 -2.1 -2.1 -2.1 -1.8 0.8 1.0 -1.9 0.7 0.9 1.1 -1.7 -0.6 1.0 4.0 -2.2 -2.2 -2.1 +V 1.0 -2.2 1.0 1.3 1.0 1.1 -2.1 1.0 -2.1 1.1 1.0 -2.2 -2.1 -2.0 -2.1 -2.2 -2.2 4.1 -2.1 -2.2 +W -2.2 4.1 -2.9 -3.2 0.0 1.4 -2.1 -2.2 -3.0 -0.3 -2.0 -3.0 -1.6 -2.3 1.8 0.8 -2.2 -2.1 7.5 -0.5 +Y -2.4 2.6 2.3 -0.9 2.0 -1.8 2.3 -1.6 -0.8 -1.6 -2.9 2.5 -2.3 -0.8 -1.9 0.3 -2.1 -2.2 -0.5 6.5 diff --git a/code/lib/Bio/Align/substitution_matrices/data/GONNET1992 b/code/lib/Bio/Align/substitution_matrices/data/GONNET1992 new file mode 100644 index 0000000..ac4e821 --- /dev/null +++ 
b/code/lib/Bio/Align/substitution_matrices/data/GONNET1992 @@ -0,0 +1,26 @@ +# Gaston H. Gonnet, Mark A. Cohen, Steven A. Benner: +# "Exhaustive matching of the entire protein sequence database." +# Science 256(5062): 1443-1445 (1992). +# Figure 2. +# PMID 1604319 + A C D E F G H I K L M N P Q R S T V W Y +A 2.4 0.5 -0.3 0.0 -2.3 0.5 -0.8 -0.8 -0.4 -1.2 -0.7 -0.3 0.3 -0.2 -0.6 1.1 0.6 0.1 -3.6 -2.2 +C 0.5 11.5 -3.2 -3.0 -0.8 -2.0 -1.3 -1.1 -2.8 -1.5 -0.9 -1.8 -3.1 -2.4 -2.2 0.1 -0.5 0.0 -1.0 -0.5 +D -0.3 -3.2 4.7 2.7 -4.5 0.1 0.4 -3.8 0.5 -4.0 -3.0 2.2 -0.7 0.9 -0.3 0.5 0.0 -2.9 -5.2 -2.8 +E 0.0 -3.0 2.7 3.6 -3.9 -0.8 0.4 -2.7 1.2 -2.8 -2.0 0.9 -0.5 1.7 0.4 0.2 -0.1 -1.9 -4.3 -2.7 +F -2.3 -0.8 -4.5 -3.9 7.0 -5.2 -0.1 1.0 -3.3 2.0 1.6 -3.1 -3.8 -2.6 -3.2 -2.8 -2.2 0.1 3.6 5.1 +G 0.5 -2.0 0.1 -0.8 -5.2 6.6 -1.4 -4.5 -1.1 -4.4 -3.5 0.4 -1.6 -1.0 -1.0 0.4 -1.1 -3.3 -4.0 -4.0 +H -0.8 -1.3 0.4 0.4 -0.1 -1.4 6.0 -2.2 0.6 -1.9 -1.3 1.2 -1.1 1.2 0.6 -0.2 -0.3 -2.0 -0.8 2.2 +I -0.8 -1.1 -3.8 -2.7 1.0 -4.5 -2.2 4.0 -2.1 2.8 2.5 -2.8 -2.6 -1.9 -2.4 -1.8 -0.6 3.1 -1.8 -0.7 +K -0.4 -2.8 0.5 1.2 -3.3 -1.1 0.6 -2.1 3.2 -2.1 -1.4 0.8 -0.6 1.5 2.7 0.1 0.1 -1.7 -3.5 -2.1 +L -1.2 -1.5 -4.0 -2.8 2.0 -4.4 -1.9 2.8 -2.1 4.0 2.8 -3.0 -2.3 -1.6 -2.2 -2.1 -1.3 1.8 -0.7 0.0 +M -0.7 -0.9 -3.0 -2.0 1.6 -3.5 -1.3 2.5 -1.4 2.8 4.3 -2.2 -2.4 -1.0 -1.7 -1.4 -0.6 1.6 -1.0 -0.2 +N -0.3 -1.8 2.2 0.9 -3.1 0.4 1.2 -2.8 0.8 -3.0 -2.2 3.8 -0.9 0.7 0.3 0.9 0.5 -2.2 -3.6 -1.4 +P 0.3 -3.1 -0.7 -0.5 -3.8 -1.6 -1.1 -2.6 -0.6 -2.3 -2.4 -0.9 7.6 -0.2 -0.9 0.4 0.1 -1.8 -5.0 -3.1 +Q -0.2 -2.4 0.9 1.7 -2.6 -1.0 1.2 -1.9 1.5 -1.6 -1.0 0.7 -0.2 2.7 1.5 0.2 0.0 -1.5 -2.7 -1.7 +R -0.6 -2.2 -0.3 0.4 -3.2 -1.0 0.6 -2.4 2.7 -2.2 -1.7 0.3 -0.9 1.5 4.7 -0.2 -0.2 -2.0 -1.6 -1.8 +S 1.1 0.1 0.5 0.2 -2.8 0.4 -0.2 -1.8 0.1 -2.1 -1.4 0.9 0.4 0.2 -0.2 2.2 1.5 -1.0 -3.3 -1.9 +T 0.6 -0.5 0.0 -0.1 -2.2 -1.1 -0.3 -0.6 0.1 -1.3 -0.6 0.5 0.1 0.0 -0.2 1.5 2.5 0.0 -3.5 -1.9 +V 0.1 0.0 -2.9 -1.9 0.1 -3.3 -2.0 3.1 -1.7 1.8 1.6 -2.2 -1.8 -1.5 -2.0 -1.0 0.0 3.4 -2.6 -1.1 +W -3.6 -1.0 -5.2 -4.3 3.6 -4.0 -0.8 -1.8 -3.5 -0.7 -1.0 -3.6 -5.0 -2.7 -1.6 -3.3 -3.5 -2.6 14.2 4.1 +Y -2.2 -0.5 -2.8 -2.7 5.1 -4.0 2.2 -0.7 -2.1 0.0 -0.2 -1.4 -3.1 -1.7 -1.8 -1.9 -1.9 -1.1 4.1 7.8 diff --git a/code/lib/Bio/Align/substitution_matrices/data/HOXD70 b/code/lib/Bio/Align/substitution_matrices/data/HOXD70 new file mode 100644 index 0000000..4cbd0f6 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/HOXD70 @@ -0,0 +1,9 @@ +# F. Chiaromonte, V.B. Yap, W. Miller: +# "Scoring pairwise genomic sequence alignments" +# Pacific Symposium on Biocomputing 2002: 115-26 (2002). +# PMID 11928468 + A C G T +A 91 -114 -31 -123 +C -114 100 -125 -31 +G -31 -125 100 -114 +T -123 -31 -114 91 diff --git a/code/lib/Bio/Align/substitution_matrices/data/JOHNSON b/code/lib/Bio/Align/substitution_matrices/data/JOHNSON new file mode 100644 index 0000000..7d30964 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/JOHNSON @@ -0,0 +1,27 @@ +# Mark S. Johnson and John P. Overington: +# "A structural basis for sequence comparisons. An evaluation of scoring +# methodologies." +# Journal of Molecular Biology 233(4): 716-738 (1993). +# Table 3, upper triangle. 
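`load()` (the last function in `__init__.py` above) resolves a matrix name against this `data/` directory, and `format()`/`str()` render the result back into the same column-aligned layout these files use, defaulting to `"%i"` for integer dtypes and `"%.1f"` otherwise. A quick sketch, assuming the vendored package imports cleanly:

```python
from Bio.Align.substitution_matrices import load

hoxd70 = load("HOXD70")     # the 4x4 DNA scoring matrix added just above
print(hoxd70["A", "T"])     # -123.0 (read() stores every score as float)
print(hoxd70.format("%d"))  # reprint the matrix with whole-number formatting
```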
+# PMID 8411177 + A C D E F G H I K L M N P Q R S T V W Y +A 0.60 -0.34 -0.16 -0.07 -0.32 -0.05 -0.31 -0.22 -0.09 -0.33 -0.15 -0.14 -0.10 -0.06 -0.16 0.00 -0.08 -0.05 -0.58 -0.40 +C -0.34 1.61 -0.97 -0.69 -0.44 -0.82 -0.82 -0.77 -0.87 -0.87 -0.44 -0.76 -0.89 -0.69 -0.56 -0.77 -0.60 -0.48 -0.91 -0.77 +D -0.16 -0.97 0.85 0.24 -0.70 -0.21 -0.07 -0.48 -0.15 -0.80 -0.59 0.26 -0.10 -0.11 -0.34 -0.02 -0.18 -0.52 -0.60 -0.38 +E -0.07 -0.69 0.24 0.86 -0.64 -0.25 -0.23 -0.48 0.11 -0.56 -0.28 -0.07 -0.15 0.24 -0.02 -0.22 -0.05 -0.42 -0.76 -0.37 +F -0.32 -0.44 -0.70 -0.64 1.04 -0.86 -0.17 0.05 -0.56 0.18 -0.06 -0.38 -0.50 -0.64 -0.60 -0.48 -0.50 -0.13 0.34 0.34 +G -0.05 -0.82 -0.21 -0.25 -0.86 0.80 -0.32 -0.55 -0.35 -0.72 -0.52 -0.14 -0.25 -0.28 -0.28 -0.13 -0.38 -0.56 -0.63 -0.54 +H -0.31 -0.82 -0.07 -0.23 -0.17 -0.32 1.27 -0.51 0.01 -0.42 -0.23 0.17 -0.43 0.14 0.01 -0.26 -0.30 -0.39 -0.40 -0.04 +I -0.22 -0.77 -0.48 -0.48 0.05 -0.55 -0.51 0.81 -0.47 0.26 0.26 -0.47 -0.57 -0.70 -0.54 -0.47 -0.32 0.39 -0.33 -0.25 +K -0.09 -0.87 -0.15 0.11 -0.56 -0.35 0.01 -0.47 0.76 -0.34 -0.19 0.01 -0.06 0.11 0.32 -0.15 -0.02 -0.37 -0.54 -0.37 +L -0.33 -0.87 -0.80 -0.56 0.18 -0.72 -0.42 0.26 -0.34 0.73 0.44 -0.48 -0.28 -0.44 -0.37 -0.52 -0.46 0.18 -0.10 -0.24 +M -0.15 -0.44 -0.59 -0.28 -0.06 -0.52 -0.23 0.26 -0.19 0.44 1.12 -0.37 -0.98 -0.06 -0.42 -0.48 -0.32 0.07 -0.09 -0.13 +N -0.14 -0.76 0.26 -0.07 -0.38 -0.14 0.17 -0.47 0.01 -0.48 -0.37 0.80 -0.24 -0.08 -0.15 0.10 0.01 -0.57 -0.61 -0.13 +P -0.10 -0.89 -0.10 -0.15 -0.50 -0.25 -0.43 -0.57 -0.06 -0.28 -0.98 -0.24 1.03 -0.36 -0.36 -0.10 -0.20 -0.52 -0.74 -0.70 +Q -0.06 -0.69 -0.11 0.24 -0.64 -0.28 0.14 -0.70 0.11 -0.44 -0.06 -0.08 -0.36 0.90 0.21 -0.12 -0.04 -0.36 -0.82 -0.51 +R -0.16 -0.56 -0.34 -0.02 -0.60 -0.28 0.01 -0.54 0.32 -0.37 -0.42 -0.15 -0.36 0.21 1.00 -0.06 -0.14 -0.49 -0.38 -0.21 +S 0.00 -0.77 -0.02 -0.22 -0.48 -0.13 -0.26 -0.47 -0.15 -0.52 -0.48 0.10 -0.10 -0.12 -0.06 0.58 0.20 -0.43 -0.62 -0.34 +T -0.08 -0.60 -0.18 -0.05 -0.50 -0.38 -0.30 -0.32 -0.02 -0.46 -0.32 0.01 -0.20 -0.04 -0.14 0.20 0.68 -0.19 -0.93 -0.27 +V -0.05 -0.48 -0.52 -0.42 -0.13 -0.56 -0.39 0.39 -0.37 0.18 0.07 -0.57 -0.52 -0.36 -0.49 -0.43 -0.19 0.70 -0.49 -0.18 +W -0.58 -0.91 -0.60 -0.76 0.34 -0.63 -0.40 -0.33 -0.54 -0.10 -0.09 -0.61 -0.74 -0.82 -0.38 -0.62 -0.93 -0.49 1.52 0.23 +Y -0.40 -0.77 -0.38 -0.37 0.34 -0.54 -0.04 -0.25 -0.37 -0.24 -0.13 -0.13 -0.70 -0.51 -0.21 -0.34 -0.27 -0.18 0.23 1.05 diff --git a/code/lib/Bio/Align/substitution_matrices/data/JONES b/code/lib/Bio/Align/substitution_matrices/data/JONES new file mode 100644 index 0000000..daed995 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/JONES @@ -0,0 +1,26 @@ +# David T. Jones, William R. Taylor, Janet M. Thornton: +# "The rapid generation of mutation data matrices from protein sequences." +# Computer Applications in the Biosciences: CABIOS 8(3): 275-282 (1992). +# Table I, lower triangle. 
+# PMID 1633570 + A R N D C Q E G H I L K M F P S T W Y V +A 0.2 -0.1 0.0 0.0 -0.1 -0.1 -0.1 0.1 -0.2 0.0 -0.1 -0.1 -0.1 -0.3 0.1 0.1 0.2 -0.4 -0.3 0.1 +R -0.1 0.5 0.0 -0.1 -0.1 0.2 0.0 0.0 0.2 -0.3 -0.3 0.4 -0.2 -0.4 -0.1 -0.1 -0.1 0.0 -0.2 -0.3 +N 0.0 0.0 0.3 0.2 -0.1 0.0 0.1 0.0 0.1 -0.2 -0.3 0.1 -0.2 -0.3 -0.1 0.1 0.1 -0.5 -0.1 -0.2 +D 0.0 -0.1 0.2 0.5 -0.3 0.1 0.4 0.1 0.0 -0.3 -0.4 0.0 -0.3 -0.5 -0.2 0.0 -0.1 -0.5 -0.2 -0.2 +C -0.1 -0.1 -0.1 -0.3 1.1 -0.3 -0.4 -0.1 0.0 -0.2 -0.3 -0.3 -0.2 0.0 -0.2 0.1 -0.1 0.1 0.2 -0.2 +Q -0.1 0.2 0.0 0.1 -0.3 0.5 0.2 -0.1 0.2 -0.3 -0.2 0.2 -0.2 -0.4 0.0 -0.1 -0.1 -0.3 -0.2 -0.3 +E -0.1 0.0 0.1 0.4 -0.4 0.2 0.5 0.0 0.0 -0.3 -0.4 0.1 -0.3 -0.5 -0.2 -0.1 -0.1 -0.5 -0.4 -0.2 +G 0.1 0.0 0.0 0.1 -0.1 -0.1 0.0 0.5 -0.2 -0.3 -0.4 -0.1 -0.3 -0.5 -0.1 0.1 -0.1 -0.2 -0.4 -0.2 +H -0.2 0.2 0.1 0.0 0.0 0.2 0.0 -0.2 0.6 -0.3 -0.2 0.1 -0.2 0.0 0.0 -0.1 -0.1 -0.3 0.4 -0.3 +I 0.0 -0.3 -0.2 -0.3 -0.2 -0.3 -0.3 -0.3 -0.3 0.4 0.2 -0.3 0.3 0.0 -0.2 -0.1 0.1 -0.4 -0.2 0.4 +L -0.1 -0.3 -0.3 -0.4 -0.3 -0.2 -0.4 -0.4 -0.2 0.2 0.5 -0.3 0.3 0.2 0.0 -0.2 -0.1 -0.2 -0.1 0.2 +K -0.1 0.4 0.1 0.0 -0.3 0.2 0.1 -0.1 0.1 -0.3 -0.3 0.5 -0.2 -0.5 -0.2 -0.1 -0.1 -0.3 -0.3 -0.3 +M -0.1 -0.2 -0.2 -0.3 -0.2 -0.2 -0.3 -0.3 -0.2 0.3 0.3 -0.2 0.6 0.0 -0.2 -0.1 0.0 -0.3 -0.2 0.2 +F -0.3 -0.4 -0.3 -0.5 0.0 -0.4 -0.5 -0.5 0.0 0.0 0.2 -0.5 0.0 0.8 -0.3 -0.2 -0.2 -0.1 0.5 0.0 +P 0.1 -0.1 -0.1 -0.2 -0.2 0.0 -0.2 -0.1 0.0 -0.2 0.0 -0.2 -0.2 -0.3 0.6 0.1 0.1 -0.4 -0.3 -0.1 +S 0.1 -0.1 0.1 0.0 0.1 -0.1 -0.1 0.1 -0.1 -0.1 -0.2 -0.1 -0.1 -0.2 0.1 0.2 0.1 -0.3 -0.1 -0.1 +T 0.2 -0.1 0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 0.1 -0.1 -0.1 0.0 -0.2 0.1 0.1 0.2 -0.4 -0.3 0.0 +W -0.4 0.0 -0.5 -0.5 0.1 -0.3 -0.5 -0.2 -0.3 -0.4 -0.2 -0.3 -0.3 -0.1 -0.4 -0.3 -0.4 1.5 0.0 -0.3 +Y -0.3 -0.2 -0.1 -0.2 0.2 -0.2 -0.4 -0.4 0.4 -0.2 -0.1 -0.3 -0.2 0.5 -0.3 -0.1 -0.3 0.0 0.9 -0.3 +V 0.1 -0.3 -0.2 -0.2 -0.2 -0.3 -0.2 -0.2 -0.3 0.4 0.2 -0.3 0.2 0.0 -0.1 -0.1 0.0 -0.3 -0.3 0.4 diff --git a/code/lib/Bio/Align/substitution_matrices/data/LEVIN b/code/lib/Bio/Align/substitution_matrices/data/LEVIN new file mode 100644 index 0000000..2f9c8c4 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/LEVIN @@ -0,0 +1,27 @@ +# Jonathan M. Levin, Barry Robson, Jean Garnier: +# "An algorithm for secondary structure determination in proteins based on +# sequence similarity." +# FEBS Letters 205(2): 303-308 (1986). +# Figure 1. 
+# PMID 3743779 + A C D E F G H I K L M N P Q R S T V W Y +A 2 0 0 1 -1 0 0 0 0 0 0 0 -1 0 0 1 0 0 -1 -1 +C 0 2 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 +D 0 0 2 1 -1 0 0 -1 0 -1 -1 1 0 0 0 0 0 -1 -1 -1 +E 1 0 1 2 -1 0 0 -1 0 -1 -1 0 -1 1 0 0 0 -1 -1 -1 +F -1 -1 -1 -1 2 -1 -1 1 -1 0 0 -1 -1 -1 -1 -1 -1 0 0 1 +G 0 0 0 0 -1 2 0 -1 0 -1 -1 0 0 0 0 0 0 -1 -1 -1 +H 0 0 0 0 -1 0 2 -1 0 -1 -1 0 0 0 0 0 0 -1 -1 0 +I 0 0 -1 -1 1 -1 -1 2 -1 0 0 -1 -1 -1 -1 -1 0 1 0 0 +K 0 0 0 0 -1 0 0 -1 2 -1 -1 1 0 0 1 0 0 -1 -1 -1 +L 0 0 -1 -1 0 -1 -1 0 -1 2 2 -1 -1 -1 -1 -1 0 1 0 0 +M 0 0 -1 -1 0 -1 -1 0 -1 2 2 -1 -1 -1 -1 -1 0 0 0 0 +N 0 0 1 0 -1 0 0 -1 1 -1 -1 3 0 1 0 0 0 -1 -1 -1 +P -1 0 0 -1 -1 0 0 -1 0 -1 -1 0 3 0 0 0 0 -1 -1 -1 +Q 0 0 0 1 -1 0 0 -1 0 -1 -1 1 0 2 0 0 0 -1 -1 -1 +R 0 0 0 0 -1 0 0 -1 1 -1 -1 0 0 0 2 0 0 -1 0 -1 +S 1 0 0 0 -1 0 0 -1 0 -1 -1 0 0 0 0 2 0 -1 -1 -1 +T 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 2 0 -1 -1 +V 0 0 -1 -1 0 -1 -1 1 -1 1 0 -1 -1 -1 -1 -1 0 2 0 0 +W -1 -1 -1 -1 0 -1 -1 0 -1 0 0 -1 -1 -1 0 -1 -1 0 2 0 +Y -1 -1 -1 -1 1 -1 0 0 -1 0 0 -1 -1 -1 -1 -1 -1 0 0 2 diff --git a/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN b/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN new file mode 100644 index 0000000..adf81ce --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN @@ -0,0 +1,27 @@ +# A.D. McLachlan: +# "Tests for comparing related amino-acid sequences. Cytochrome c and +# cytochrome c 551." +# Journal of Molecular Biology 61(2): 409-424 (1971). +# Figure 1. +# PMID 5167087 + A C D E F G H I K L M N P Q R S T V W Y +A 8 1 3 4 1 3 3 2 3 2 3 3 4 3 2 4 3 3 1 1 +C 1 9 1 0 0 1 3 1 0 0 3 1 0 0 1 2 2 1 2 1 +D 3 1 8 5 1 3 4 1 3 1 2 5 3 4 1 3 3 1 0 1 +E 4 0 5 8 0 3 2 1 4 1 1 4 4 5 3 4 4 2 1 2 +F 1 0 1 0 9 0 4 3 0 5 5 0 1 0 1 2 1 3 6 6 +G 3 1 3 3 0 8 2 1 3 1 1 3 3 2 3 3 2 2 1 0 +H 3 3 4 2 4 2 8 2 4 2 3 4 3 4 5 3 4 2 3 4 +I 2 1 1 1 3 1 2 8 1 5 5 1 1 0 1 2 3 5 3 3 +K 3 0 3 4 0 3 4 1 8 2 1 4 3 4 5 3 3 2 1 1 +L 2 0 1 1 5 1 2 5 2 8 6 1 1 3 2 2 3 5 3 3 +M 3 3 2 1 5 1 3 5 1 6 8 2 1 3 1 2 3 4 1 2 +N 3 1 5 4 0 3 4 1 4 1 2 8 1 4 3 5 3 1 0 2 +P 4 0 3 4 1 3 3 1 3 1 1 1 8 3 3 3 3 2 0 0 +Q 3 0 4 5 0 2 4 0 4 3 3 4 3 8 5 4 3 2 2 1 +R 2 1 1 3 1 3 5 1 5 2 1 3 3 5 8 4 3 2 3 2 +S 4 2 3 4 2 3 3 2 3 2 2 5 3 4 4 8 5 2 3 3 +T 3 2 3 4 1 2 4 3 3 3 3 3 3 3 3 5 8 3 2 1 +V 3 1 1 2 3 2 2 5 2 5 4 1 2 2 2 2 3 8 2 3 +W 1 2 0 1 6 1 3 3 1 3 1 0 0 2 3 3 2 2 9 6 +Y 1 1 1 2 6 0 4 3 1 3 2 2 0 1 2 3 1 3 6 9 diff --git a/code/lib/Bio/Align/substitution_matrices/data/MDM78 b/code/lib/Bio/Align/substitution_matrices/data/MDM78 new file mode 100644 index 0000000..5d0b2ef --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/MDM78 @@ -0,0 +1,27 @@ +# R.M. Schwartz and M.O. Dayhoff: +# "Matrices for Detecting Distant Relationships." +# Margaret O. Dayhoff: Atlas of Protein Sequence and Structure, +# Volume 5, Supplement 3, 1978, pages 353-358. +# The National Biomedical Research Foundation, 1979. +# Figure 85, page 354. 
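`select()` from the `Array` class reindexes a matrix onto a new alphabet, copying the scores for letters the two alphabets share (letters missing from the source matrix are skipped, leaving zero entries). An illustrative sketch:

```python
from Bio.Align.substitution_matrices import load

blosum62 = load("BLOSUM62")
mini = blosum62.select("ACGT")  # restrict to four of the amino acid codes
print(mini.alphabet)            # ACGT
print(mini["C", "C"])           # 9.0, unchanged from blosum62["C", "C"]
```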
+ A R N D C Q E G H I L K M F P S T W Y V +A 0.18 -0.15 0.02 0.03 -0.20 -0.04 0.03 0.13 -0.14 -0.05 -0.19 -0.12 -0.11 -0.35 0.11 0.11 0.12 -0.58 -0.35 0.02 +R -0.15 0.61 0.00 -0.13 -0.36 0.13 -0.11 -0.26 0.16 -0.20 -0.30 0.34 -0.04 -0.45 -0.02 -0.03 -0.09 0.22 -0.42 -0.25 +N 0.02 0.00 0.20 0.21 -0.36 0.08 0.14 0.03 0.16 -0.18 -0.29 0.10 -0.17 -0.35 -0.05 0.07 0.04 -0.42 -0.21 -0.17 +D 0.03 -0.13 0.21 0.39 -0.51 0.16 0.34 0.06 0.07 -0.24 -0.40 0.01 -0.26 -0.56 -0.10 0.03 -0.01 -0.68 -0.43 -0.21 +C -0.20 -0.36 -0.36 -0.51 1.19 -0.54 -0.53 -0.34 -0.34 -0.23 -0.60 -0.54 -0.52 -0.43 -0.28 0.00 -0.22 -0.78 0.03 -0.19 +Q -0.04 0.13 0.08 0.16 -0.54 0.40 0.25 -0.12 0.29 -0.20 -0.18 0.07 -0.10 -0.47 0.02 -0.05 -0.08 -0.48 -0.40 -0.19 +E 0.03 -0.11 0.14 0.34 -0.53 0.25 0.38 0.02 0.07 -0.20 -0.34 -0.01 -0.21 -0.54 -0.06 0.00 -0.04 -0.70 -0.43 -0.18 +G 0.13 -0.26 0.03 0.06 -0.34 -0.12 0.02 0.48 -0.21 -0.26 -0.41 -0.17 -0.28 -0.48 -0.05 0.11 0.00 -0.70 -0.52 -0.14 +H -0.14 0.16 0.16 0.07 -0.34 0.29 0.07 -0.21 0.65 -0.24 -0.21 0.00 -0.21 -0.18 -0.02 -0.08 -0.13 -0.28 -0.01 -0.22 +I -0.05 -0.20 -0.18 -0.24 -0.23 -0.20 -0.20 -0.26 -0.24 0.45 0.24 -0.19 0.22 0.10 -0.20 -0.14 0.01 -0.51 -0.09 0.37 +L -0.19 -0.30 -0.29 -0.40 -0.60 -0.18 -0.34 -0.41 -0.21 0.24 0.59 -0.29 0.37 0.18 -0.25 -0.28 -0.17 -0.18 -0.09 0.19 +K -0.12 0.34 0.10 0.01 -0.54 0.07 -0.01 -0.17 0.00 -0.19 -0.29 0.47 0.04 -0.53 -0.11 -0.02 0.00 -0.35 -0.44 -0.24 +M -0.11 -0.04 -0.17 -0.26 -0.52 -0.10 -0.21 -0.28 -0.21 0.22 0.37 0.04 0.64 0.02 -0.21 -0.16 -0.06 -0.42 -0.24 0.18 +F -0.35 -0.45 -0.35 -0.56 -0.43 -0.47 -0.54 -0.48 -0.18 0.10 0.18 -0.53 0.02 0.91 -0.46 -0.32 -0.31 0.04 0.70 -0.12 +P 0.11 -0.02 -0.05 -0.10 -0.28 0.02 -0.06 -0.05 -0.02 -0.20 -0.25 -0.11 -0.21 -0.46 0.59 0.09 0.03 -0.56 -0.49 -0.12 +S 0.11 -0.03 0.07 0.03 0.00 -0.05 0.00 0.11 -0.08 -0.14 -0.28 -0.02 -0.16 -0.32 0.09 0.16 0.13 -0.25 -0.28 -0.10 +T 0.12 -0.09 0.04 -0.01 -0.22 -0.08 -0.04 0.00 -0.13 0.01 -0.17 0.00 -0.06 -0.31 0.03 0.13 0.26 -0.52 -0.27 0.03 +W -0.58 0.22 -0.42 -0.68 -0.78 -0.48 -0.70 -0.70 -0.28 -0.51 -0.18 -0.35 -0.42 0.04 -0.56 -0.25 -0.52 1.73 -0.02 -0.62 +Y -0.35 -0.42 -0.21 -0.43 0.03 -0.40 -0.43 -0.52 -0.01 -0.09 -0.09 -0.44 -0.24 0.70 -0.49 -0.28 -0.27 -0.02 1.01 -0.25 +V 0.02 -0.25 -0.17 -0.21 -0.19 -0.19 -0.18 -0.14 -0.22 0.37 0.19 -0.24 0.18 -0.12 -0.12 -0.10 0.03 -0.62 -0.25 0.43 diff --git a/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4 b/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4 new file mode 100644 index 0000000..6fb12d2 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4 @@ -0,0 +1,25 @@ +# +# This matrix was created by Todd Lowe 12/10/92 +# +# Uses ambiguous nucleotide codes, probabilities rounded to +# nearest integer +# +# Lowest score = -4, Highest score = 5 +# + A T G C S W R Y K M B V H D N +A 5 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2 +T -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 +G -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2 +C -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2 +S -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1 +W 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1 +R 1 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1 +Y -4 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1 +K -4 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1 +M 1 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1 +B -4 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1 +V -1 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1 +H -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1 +D -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1 +N -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + diff --git 
a/code/lib/Bio/Align/substitution_matrices/data/PAM250 b/code/lib/Bio/Align/substitution_matrices/data/PAM250 new file mode 100644 index 0000000..17e9e60 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/PAM250 @@ -0,0 +1,34 @@ +# +# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93] +# +# PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049 +# +# Expected score = -0.844, Entropy = 0.354 bits +# +# Lowest score = -8, Highest score = 17 +# + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 +R -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 +N 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 +D 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8 +C -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8 +Q 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8 +E 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8 +G 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8 +H -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8 +I -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8 +L -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8 +K -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8 +M -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8 +F -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8 +P 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8 +S 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8 +T 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8 +W -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8 +Y -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8 +V 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 +B 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 +Z 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 +X 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 +* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/PAM30 b/code/lib/Bio/Align/substitution_matrices/data/PAM30 new file mode 100644 index 0000000..8a01c88 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/PAM30 @@ -0,0 +1,34 @@ +# +# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93] +# +# PAM 30 substitution matrix, scale = ln(2)/2 = 0.346574 +# +# Expected score = -5.06, Entropy = 2.57 bits +# +# Lowest score = -17, Highest score = 13 +# + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 6 -7 -4 -3 -6 -4 -2 -2 -7 -5 -6 -7 -5 -8 -2 0 -1 -13 -8 -2 -3 -3 -3 -17 +R -7 8 -6 -10 -8 -2 -9 -9 -2 -5 -8 0 -4 -9 -4 -3 -6 -2 -10 -8 -7 -4 -6 -17 +N -4 -6 8 2 -11 -3 -2 -3 0 -5 -7 -1 -9 -9 -6 0 -2 -8 -4 -8 6 -3 -3 -17 +D -3 -10 2 8 -14 -2 2 -3 -4 -7 -12 -4 -11 -15 -8 -4 -5 -15 -11 -8 6 1 -5 -17 +C -6 -8 -11 -14 10 -14 -14 -9 -7 -6 -15 -14 -13 -13 -8 -3 -8 -15 -4 -6 -12 -14 -9 -17 +Q -4 -2 -3 -2 -14 8 1 -7 1 -8 -5 -3 -4 -13 -3 -5 -5 -13 -12 -7 -3 6 -5 -17 +E -2 -9 -2 2 -14 1 8 -4 -5 -5 -9 -4 -7 -14 -5 -4 -6 -17 -8 -6 1 6 -5 -17 +G -2 -9 -3 -3 -9 -7 -4 6 -9 -11 -10 -7 -8 -9 -6 -2 -6 -15 -14 -5 -3 -5 -5 -17 +H -7 -2 0 -4 -7 1 -5 -9 9 -9 -6 -6 -10 -6 -4 -6 -7 -7 -3 -6 -1 -1 -5 -17 +I -5 -5 -5 -7 -6 -8 -5 -11 -9 8 -1 -6 -1 -2 -8 -7 -2 -14 -6 2 -6 -6 -5 -17 +L -6 -8 -7 -12 -15 -5 -9 -10 -6 -1 7 -8 1 -3 -7 -8 -7 -6 -7 -2 -9 -7 -6 -17 +K -7 0 -1 -4 -14 -3 -4 -7 -6 -6 -8 7 -2 -14 -6 -4 -3 -12 -9 -9 -2 -4 -5 
-17 +M -5 -4 -9 -11 -13 -4 -7 -8 -10 -1 1 -2 11 -4 -8 -5 -4 -13 -11 -1 -10 -5 -5 -17 +F -8 -9 -9 -15 -13 -13 -14 -9 -6 -2 -3 -14 -4 9 -10 -6 -9 -4 2 -8 -10 -13 -8 -17 +P -2 -4 -6 -8 -8 -3 -5 -6 -4 -8 -7 -6 -8 -10 8 -2 -4 -14 -13 -6 -7 -4 -5 -17 +S 0 -3 0 -4 -3 -5 -4 -2 -6 -7 -8 -4 -5 -6 -2 6 0 -5 -7 -6 -1 -5 -3 -17 +T -1 -6 -2 -5 -8 -5 -6 -6 -7 -2 -7 -3 -4 -9 -4 0 7 -13 -6 -3 -3 -6 -4 -17 +W -13 -2 -8 -15 -15 -13 -17 -15 -7 -14 -6 -12 -13 -4 -14 -5 -13 13 -5 -15 -10 -14 -11 -17 +Y -8 -10 -4 -11 -4 -12 -8 -14 -3 -6 -7 -9 -11 2 -13 -7 -6 -5 10 -7 -6 -9 -7 -17 +V -2 -8 -8 -8 -6 -7 -6 -5 -6 2 -2 -9 -1 -8 -6 -6 -3 -15 -7 7 -8 -6 -5 -17 +B -3 -7 6 6 -12 -3 1 -3 -1 -6 -9 -2 -10 -10 -7 -1 -3 -10 -6 -8 6 0 -5 -17 +Z -3 -4 -3 1 -14 6 6 -5 -1 -6 -7 -4 -5 -13 -4 -5 -6 -14 -9 -6 0 6 -5 -17 +X -3 -6 -3 -5 -9 -5 -5 -5 -5 -5 -6 -5 -5 -8 -5 -3 -4 -11 -7 -5 -5 -5 -5 -17 +* -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/PAM70 b/code/lib/Bio/Align/substitution_matrices/data/PAM70 new file mode 100644 index 0000000..b20cdf0 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/PAM70 @@ -0,0 +1,34 @@ +# +# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93] +# +# PAM 70 substitution matrix, scale = ln(2)/2 = 0.346574 +# +# Expected score = -2.77, Entropy = 1.60 bits +# +# Lowest score = -11, Highest score = 13 +# + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 5 -4 -2 -1 -4 -2 -1 0 -4 -2 -4 -4 -3 -6 0 1 1 -9 -5 -1 -1 -1 -2 -11 +R -4 8 -3 -6 -5 0 -5 -6 0 -3 -6 2 -2 -7 -2 -1 -4 0 -7 -5 -4 -2 -3 -11 +N -2 -3 6 3 -7 -1 0 -1 1 -3 -5 0 -5 -6 -3 1 0 -6 -3 -5 5 -1 -2 -11 +D -1 -6 3 6 -9 0 3 -1 -1 -5 -8 -2 -7 -10 -4 -1 -2 -10 -7 -5 5 2 -3 -11 +C -4 -5 -7 -9 9 -9 -9 -6 -5 -4 -10 -9 -9 -8 -5 -1 -5 -11 -2 -4 -8 -9 -6 -11 +Q -2 0 -1 0 -9 7 2 -4 2 -5 -3 -1 -2 -9 -1 -3 -3 -8 -8 -4 -1 5 -2 -11 +E -1 -5 0 3 -9 2 6 -2 -2 -4 -6 -2 -4 -9 -3 -2 -3 -11 -6 -4 2 5 -3 -11 +G 0 -6 -1 -1 -6 -4 -2 6 -6 -6 -7 -5 -6 -7 -3 0 -3 -10 -9 -3 -1 -3 -3 -11 +H -4 0 1 -1 -5 2 -2 -6 8 -6 -4 -3 -6 -4 -2 -3 -4 -5 -1 -4 0 1 -3 -11 +I -2 -3 -3 -5 -4 -5 -4 -6 -6 7 1 -4 1 0 -5 -4 -1 -9 -4 3 -4 -4 -3 -11 +L -4 -6 -5 -8 -10 -3 -6 -7 -4 1 6 -5 2 -1 -5 -6 -4 -4 -4 0 -6 -4 -4 -11 +K -4 2 0 -2 -9 -1 -2 -5 -3 -4 -5 6 0 -9 -4 -2 -1 -7 -7 -6 -1 -2 -3 -11 +M -3 -2 -5 -7 -9 -2 -4 -6 -6 1 2 0 10 -2 -5 -3 -2 -8 -7 0 -6 -3 -3 -11 +F -6 -7 -6 -10 -8 -9 -9 -7 -4 0 -1 -9 -2 8 -7 -4 -6 -2 4 -5 -7 -9 -5 -11 +P 0 -2 -3 -4 -5 -1 -3 -3 -2 -5 -5 -4 -5 -7 7 0 -2 -9 -9 -3 -4 -2 -3 -11 +S 1 -1 1 -1 -1 -3 -2 0 -3 -4 -6 -2 -3 -4 0 5 2 -3 -5 -3 0 -2 -1 -11 +T 1 -4 0 -2 -5 -3 -3 -3 -4 -1 -4 -1 -2 -6 -2 2 6 -8 -4 -1 -1 -3 -2 -11 +W -9 0 -6 -10 -11 -8 -11 -10 -5 -9 -4 -7 -8 -2 -9 -3 -8 13 -3 -10 -7 -10 -7 -11 +Y -5 -7 -3 -7 -2 -8 -6 -9 -1 -4 -4 -7 -7 4 -9 -5 -4 -3 9 -5 -4 -7 -5 -11 +V -1 -5 -5 -5 -4 -4 -4 -3 -4 3 0 -6 0 -5 -3 -3 -1 -10 -5 6 -5 -4 -2 -11 +B -1 -4 5 5 -8 -1 2 -1 0 -4 -6 -1 -6 -7 -4 0 -1 -7 -4 -5 5 1 -2 -11 +Z -1 -2 -1 2 -9 5 5 -3 1 -4 -4 -2 -3 -9 -2 -2 -3 -10 -7 -4 1 5 -3 -11 +X -2 -3 -2 -3 -6 -2 -3 -3 -3 -3 -4 -3 -3 -5 -3 -1 -2 -7 -5 -2 -2 -3 -3 -11 +* -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/RAO b/code/lib/Bio/Align/substitution_matrices/data/RAO new file mode 100644 index 0000000..f3ef1c0 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/RAO @@ -0,0 +1,27 @@ +# J.K. 
Mohana Rao: +# "New scoring matrix for amino acid residue exchanges based on residue +# characteristic physical parameters." +# International Journal of Peptide and Protein Research: 29(2): 276-281 (1987). +# Figure 1, lower triangle. +# PMID 3570667 + A C D E F G H I K L M N P Q R S T V W Y +A 16 11 9 10 10 8 11 9 10 11 11 9 6 11 8 10 10 9 11 9 +C 11 16 8 9 10 8 10 8 9 11 10 9 7 10 8 10 10 8 11 10 +D 9 8 16 11 4 9 9 3 11 6 5 11 8 11 10 10 9 3 6 7 +E 10 9 11 16 6 6 11 4 11 7 8 10 5 11 9 9 8 4 7 6 +F 10 10 4 6 16 7 9 12 6 11 10 6 4 7 5 8 10 11 11 10 +G 8 8 9 6 7 16 7 6 7 6 4 10 11 8 7 11 10 6 8 10 +H 11 10 9 11 9 7 16 8 11 10 10 10 5 11 10 10 10 9 10 9 +I 9 8 3 4 12 6 8 16 4 10 9 5 3 6 4 8 10 12 11 10 +K 10 9 11 11 6 7 11 4 16 7 8 11 6 12 11 10 9 5 7 7 +L 11 11 6 7 11 6 10 10 7 16 11 7 4 9 6 8 9 10 11 9 +M 11 10 5 8 10 4 10 9 8 11 16 6 2 9 6 7 8 9 10 8 +N 9 9 11 10 6 10 10 5 11 7 6 16 9 11 10 11 10 5 8 8 +P 6 7 8 5 4 11 5 3 6 4 2 9 16 7 6 10 8 3 6 8 +Q 11 10 11 11 7 8 11 6 12 9 9 11 7 16 10 10 10 6 9 8 +R 8 8 10 9 5 7 10 4 11 6 6 10 6 10 16 9 9 5 7 7 +S 10 10 10 9 8 11 10 8 10 8 7 11 10 10 9 16 11 8 10 11 +T 10 10 9 8 10 10 10 10 9 9 8 10 8 10 9 11 16 10 11 11 +V 9 8 3 4 11 6 9 12 5 10 9 5 3 6 5 8 10 16 11 10 +W 11 11 6 7 11 8 10 11 7 11 10 8 6 9 7 10 11 11 16 11 +Y 9 10 7 6 10 10 9 10 7 9 8 8 8 8 7 11 11 10 11 16 diff --git a/code/lib/Bio/Align/substitution_matrices/data/RISLER b/code/lib/Bio/Align/substitution_matrices/data/RISLER new file mode 100644 index 0000000..438b601 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/RISLER @@ -0,0 +1,27 @@ +# J.L. Risler, M.O. Delorme, H. Delacroix, A. Henaut: +# "Amino acid substitutions in structurally related proteins. A pattern +# recognition approach. Determination of a new and efficient scoring matrix." +# Journal of Molecular Biology 204(4): 1019-1029 (1988). +# Figure 5. 
+# PMID 3221397 + A C D E F G H I K L M N P Q R S T V W Y +A 2.2 -1.5 0.2 1.7 0.6 0.6 -0.6 1.7 1.4 1.3 1.0 1.3 -0.2 1.8 1.5 2.0 1.9 2.0 -0.9 0.2 +C -1.5 2.2 -1.7 -1.5 -1.6 -1.7 -1.8 -1.6 -1.6 -1.5 -1.6 -1.6 -1.8 -1.4 -1.5 -1.3 -1.4 -1.4 -1.8 -1.1 +D 0.2 -1.7 2.2 1.0 -0.3 -0.4 -1.3 0.0 0.1 -0.2 -0.5 0.8 -1.2 0.6 -0.1 0.7 0.0 0.0 -1.4 -0.4 +E 1.7 -1.5 1.0 2.2 0.6 0.3 -0.6 1.5 1.4 0.9 0.6 1.4 -0.1 2.1 1.9 1.8 1.6 1.6 -1.0 0.2 +F 0.6 -1.6 -0.3 0.6 2.2 -0.4 -1.1 1.0 0.1 1.0 -0.2 0.4 -1.1 0.7 0.4 0.5 0.3 0.8 -0.9 2.0 +G 0.6 -1.7 -0.4 0.3 -0.4 2.2 -1.2 0.0 -0.1 -0.2 -0.4 0.2 -1.2 0.2 0.1 0.7 0.2 0.1 -1.3 -0.2 +H -0.6 -1.8 -1.3 -0.6 -1.1 -1.2 2.2 -0.8 -1.0 -0.9 -1.2 -0.3 -1.6 -0.5 -0.4 -0.4 -0.9 -0.7 -1.7 -0.8 +I 1.7 -1.6 0.0 1.5 1.0 0.0 -0.8 2.2 1.0 2.1 0.9 0.9 -0.6 1.4 1.4 1.6 1.6 2.2 -0.7 0.4 +K 1.4 -1.6 0.1 1.4 0.1 -0.1 -1.0 1.0 2.2 0.7 0.4 1.0 -0.7 1.7 2.1 1.4 1.2 1.2 -1.1 0.5 +L 1.3 -1.5 -0.2 0.9 1.0 -0.2 -0.9 2.1 0.7 2.2 1.8 0.8 -0.8 1.1 1.2 1.3 1.2 2.0 -0.8 0.5 +M 1.0 -1.6 -0.5 0.6 -0.2 -0.4 -1.2 0.9 0.4 1.8 2.2 0.0 -1.2 1.2 1.1 0.6 0.8 0.8 -1.3 -0.2 +N 1.3 -1.6 0.8 1.4 0.4 0.2 -0.3 0.9 1.0 0.8 0.0 2.2 -1.0 1.6 1.2 1.9 1.1 1.1 -1.1 -0.1 +P -0.2 -1.8 -1.2 -0.1 -1.1 -1.2 -1.6 -0.6 -0.7 -0.8 -1.2 -1.0 2.2 -0.6 -0.3 -0.3 -0.5 -0.6 -1.6 -1.2 +Q 1.8 -1.4 0.6 2.1 0.7 0.2 -0.5 1.4 1.7 1.1 1.2 1.6 -0.6 2.2 2.0 1.8 1.7 1.5 -1.0 0.5 +R 1.5 -1.5 -0.1 1.9 0.4 0.1 -0.4 1.4 2.1 1.2 1.1 1.2 -0.3 2.0 2.2 2.0 1.9 1.5 -0.8 0.8 +S 2.0 -1.3 0.7 1.8 0.5 0.7 -0.4 1.6 1.4 1.3 0.6 1.9 -0.3 1.8 2.0 2.2 2.1 1.8 -0.8 0.4 +T 1.9 -1.4 0.0 1.6 0.3 0.2 -0.9 1.6 1.2 1.2 0.8 1.1 -0.5 1.7 1.9 2.1 2.2 1.6 -1.0 0.3 +V 2.0 -1.4 0.0 1.6 0.8 0.1 -0.7 2.2 1.2 2.0 0.8 1.1 -0.6 1.5 1.5 1.8 1.6 2.2 -0.7 0.3 +W -0.9 -1.8 -1.4 -1.0 -0.9 -1.3 -1.7 -0.7 -1.1 -0.8 -1.3 -1.1 -1.6 -1.0 -0.8 -0.8 -1.0 -0.7 2.2 -0.6 +Y 0.2 -1.1 -0.4 0.2 2.0 -0.2 -0.8 0.4 0.5 0.5 -0.2 -0.1 -1.2 0.5 0.8 0.4 0.3 0.3 -0.6 2.2 diff --git a/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER b/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER new file mode 100644 index 0000000..0384fa9 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER @@ -0,0 +1,70 @@ +# Adrian Schneider, Gina M. Cannarozzi, and Gaston H. Gonnet: +# "Empirical codon substitution matrix." +# BMC Bioinformatics 6:134 (2005). +# Additional File 3. 
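The SCHNEIDER matrix below is indexed by three-letter codons rather than single letters; because its labels are longer than one character, `read()` keeps the alphabet as a tuple of strings instead of joining it into one string (the `len(key) > 1` test above). A small sketch under the same import assumptions as before:

```python
from Bio.Align.substitution_matrices import load

codon = load("SCHNEIDER")
print(type(codon.alphabet))  # <class 'tuple'> rather than str
print(codon.alphabet[:2])    # ('AAA', 'AAC')
print(codon["AAA", "AAG"])   # 9.7
```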
+# PMID 15927081 + AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC TCG TCT TGA TGC TGG TGT TTA TTC TTG TTT +AAA 11.6 -2.7 9.7 -1.7 -2.7 -6.4 -3.9 -5.6 5.1 -5.0 3.6 -4.2 -6.3 -13.0 -7.1 -11.5 0.4 -6.0 -1.9 -5.3 -8.5 -11.2 -8.9 -10.8 2.1 0.0 1.4 0.2 -10.2 -13.5 -13.0 -12.5 -2.6 -8.5 -5.0 -8.1 -6.3 -9.9 -7.5 -9.0 -7.1 -10.2 -8.2 -9.2 -8.2 -12.5 -11.1 -11.4 -50.0 -14.8 -50.0 -13.8 -7.3 -10.1 -8.4 -9.1 -50.0 -13.0 -13.5 -12.4 -10.7 -18.1 -11.8 -17.2 +AAC -2.7 13.0 -3.3 10.9 -3.5 -0.4 -3.3 -1.8 -5.4 4.6 -5.5 3.0 -10.2 -7.9 -9.9 -9.6 -5.0 0.5 -5.5 -1.0 -10.3 -8.1 -9.4 -9.6 -8.1 -5.0 -7.3 -6.3 -13.4 -11.3 -14.4 -12.9 -6.3 0.8 -6.4 -1.1 -7.4 -5.0 -6.2 -6.5 -5.6 -1.6 -4.7 -3.0 -10.8 -8.7 -11.9 -10.0 -50.0 -6.2 -50.0 -7.5 -6.3 -4.3 -6.2 -5.4 -50.0 -7.0 -16.3 -8.2 -13.2 -12.3 -13.1 -13.3 +AAG 9.7 -3.3 11.6 -2.8 -4.5 -6.7 -3.1 -6.9 3.3 -5.5 4.8 -5.1 -8.9 -13.2 -5.7 -12.6 -1.5 -6.1 -0.6 -6.1 -10.0 -11.8 -8.6 -11.9 1.2 0.5 2.2 0.1 -11.8 -14.0 -11.9 -13.4 -4.9 -9.1 -3.4 -8.9 -8.0 -10.0 -7.2 -10.0 -9.1 -10.2 -7.1 -9.9 -10.1 -13.0 -10.6 -12.7 -50.0 -14.9 -50.0 -14.4 -8.9 -10.8 -8.9 -10.4 -50.0 -13.1 -11.8 -13.0 -12.4 -19.4 -11.5 -17.8 +AAT -1.7 10.9 -2.8 12.9 -2.7 -2.2 -2.8 0.2 -4.8 2.9 -5.2 5.2 -9.1 -9.5 -9.0 -7.0 -4.0 -1.0 -5.0 1.0 -9.0 -8.8 -8.9 -7.7 -7.0 -6.8 -7.1 -4.3 -12.4 -12.9 -13.9 -10.0 -4.9 -0.7 -5.6 1.4 -6.0 -6.5 -6.2 -4.8 -5.1 -3.2 -5.0 -1.2 -9.8 -9.9 -11.3 -8.2 -50.0 -7.6 -50.0 -5.1 -5.5 -5.8 -5.9 -4.2 -50.0 -8.3 -15.5 -5.9 -11.6 -14.2 -12.0 -11.6 +ACA -2.7 -3.5 -4.5 -2.7 11.7 9.0 10.6 9.6 -3.2 -0.7 -5.1 -0.4 0.7 -4.6 -0.4 -3.6 -4.8 -8.9 -6.5 -7.8 -1.4 -4.1 -2.9 -3.3 -8.0 -9.1 -8.0 -8.2 -6.0 -9.3 -7.7 -8.5 -6.4 -10.1 -7.5 -8.6 2.8 -0.7 0.9 0.1 -5.9 -7.3 -6.5 -6.5 -0.4 -3.9 -2.6 -3.1 -50.0 -14.6 -50.0 -12.6 2.9 0.1 1.8 1.0 -50.0 -9.2 -13.6 -7.7 -4.8 -12.2 -6.2 -11.0 +ACC -6.4 -0.4 -6.7 -2.2 9.0 12.3 9.8 9.6 -7.1 2.4 -7.2 0.3 -2.9 -1.2 -3.0 -3.3 -7.6 -6.7 -8.1 -7.9 -4.9 -1.9 -4.5 -3.8 -9.5 -7.8 -9.3 -8.9 -9.4 -7.1 -9.0 -9.0 -9.4 -7.5 -9.3 -9.2 -0.7 2.6 0.2 -0.1 -8.0 -4.6 -7.7 -6.1 -3.6 -1.0 -4.1 -3.1 -50.0 -11.6 -50.0 -11.6 0.2 2.3 0.4 0.5 -50.0 -6.5 -15.3 -7.8 -8.4 -9.5 -8.3 -11.1 +ACG -3.9 -3.3 -3.1 -2.8 10.6 9.8 12.2 9.8 -4.9 0.1 -3.6 -0.2 -1.7 -4.1 0.9 -3.5 -5.9 -8.9 -5.6 -8.5 -3.9 -3.9 -2.2 -4.5 -7.3 -7.5 -6.4 -8.5 -7.1 -8.2 -6.7 -7.8 -7.9 -9.1 -7.1 -9.4 0.9 -0.2 2.5 -0.3 -6.9 -6.5 -5.5 -6.8 -1.7 -3.4 -1.7 -2.8 -50.0 -13.0 -50.0 -11.5 1.5 0.7 2.7 0.6 -50.0 -8.9 -12.1 -7.4 -6.1 -11.5 -5.4 -10.4 +ACT -5.6 -1.8 -6.9 0.2 9.6 9.6 9.8 11.6 -6.6 0.9 -7.2 2.5 -2.3 -3.4 -2.3 -0.6 -6.5 -8.2 -8.0 -6.3 -3.7 -3.5 -3.9 -1.6 -9.9 -8.9 -9.6 -7.0 -8.6 -8.9 -8.8 -6.8 -8.6 -8.2 -8.8 -6.8 0.3 0.1 0.2 2.4 -7.4 -6.2 -6.9 -4.4 -2.9 -2.9 -3.5 -0.9 -50.0 -12.2 -50.0 -10.1 1.2 0.6 1.2 2.4 -50.0 -7.6 -16.1 -5.8 -7.2 -10.8 -7.4 -9.0 +AGA 5.1 -5.4 3.3 -4.8 -3.2 -7.1 -4.9 -6.6 13.3 -2.8 11.2 -1.9 -5.7 -12.5 -7.2 -11.6 -0.7 -4.8 -3.1 -4.3 -10.0 -11.6 -8.8 -11.7 10.5 7.7 9.1 8.5 -9.8 -12.7 -11.7 -11.8 -6.3 -11.4 -8.8 -11.0 -7.5 -10.3 -8.3 -9.9 -1.9 -6.7 -4.1 -6.4 -7.9 -12.5 -11.4 -11.7 -50.0 -14.0 -50.0 -13.2 -8.5 -10.4 -9.1 -9.6 -50.0 -9.8 -7.7 -8.8 -10.3 -17.9 -11.8 -16.0 +AGC -5.0 4.6 -5.5 2.9 -0.7 2.4 0.1 0.9 -2.8 12.8 -2.6 11.0 -8.4 -6.3 -8.3 -7.7 -6.1 -2.9 -6.5 -4.2 -8.5 -5.9 -7.5 -7.7 -5.5 -2.4 -5.5 -4.1 -12.9 -10.3 -12.7 -11.3 -7.4 -3.1 -7.4 -4.9 -4.4 -1.8 -3.5 -3.5 -1.6 3.0 -0.8 0.9 -8.4 -6.3 -9.6 -7.3 -50.0 -9.0 -50.0 -9.7 -2.3 -0.2 -1.2 -1.5 
-50.0 -0.7 -13.3 -2.0 -11.8 -11.9 -12.3 -12.7 +AGG 3.6 -5.5 4.8 -5.2 -5.1 -7.2 -3.6 -7.2 11.2 -2.6 13.4 -2.1 -7.7 -12.5 -5.3 -11.9 -2.2 -4.4 -1.9 -4.8 -10.2 -11.5 -8.1 -12.0 9.3 8.2 10.0 8.0 -10.8 -11.9 -10.7 -12.4 -8.2 -11.8 -6.9 -11.5 -8.4 -9.9 -7.1 -10.6 -4.7 -6.8 -1.2 -7.0 -9.8 -11.6 -9.6 -12.1 -50.0 -14.1 -50.0 -12.7 -10.0 -10.8 -8.8 -10.5 -50.0 -9.2 -4.2 -9.3 -11.0 -18.2 -11.1 -16.1 +AGT -4.2 3.0 -5.1 5.2 -0.4 0.3 -0.2 2.5 -1.9 11.0 -2.1 13.2 -7.6 -8.5 -8.0 -5.3 -5.6 -4.5 -6.8 -2.4 -7.8 -7.9 -7.9 -6.2 -5.7 -5.1 -5.4 -2.1 -12.4 -12.3 -13.1 -10.1 -6.6 -4.4 -6.9 -2.6 -3.6 -3.8 -3.7 -2.0 -1.4 0.8 -0.9 3.0 -8.2 -8.1 -9.2 -5.8 -50.0 -10.1 -50.0 -7.4 -1.7 -2.0 -1.5 -0.5 -50.0 -2.1 -12.6 -0.4 -11.6 -13.9 -11.4 -11.1 +ATA -6.3 -10.2 -8.9 -9.1 0.7 -2.9 -1.7 -2.3 -5.7 -8.4 -7.7 -7.6 13.2 9.6 3.5 9.7 -8.7 -12.4 -10.7 -10.7 -7.8 -10.2 -8.8 -9.6 -9.5 -11.9 -10.9 -10.5 2.3 -0.4 -0.2 -0.3 -9.7 -15.5 -11.4 -13.7 -3.0 -6.4 -4.1 -5.4 -9.6 -11.9 -10.5 -11.8 6.2 3.3 3.7 3.6 -50.0 -13.6 -50.0 -11.9 -5.6 -8.8 -7.2 -8.9 -50.0 -12.4 -14.1 -11.6 2.8 -6.4 0.5 -5.2 +ATC -13.0 -7.9 -13.2 -9.5 -4.6 -1.2 -4.1 -3.4 -12.5 -6.3 -12.5 -8.5 9.6 12.7 0.2 10.5 -12.4 -11.6 -13.2 -12.0 -12.4 -10.1 -11.6 -12.0 -15.1 -13.1 -14.3 -13.4 -1.4 1.4 -1.6 -0.5 -14.9 -14.3 -15.2 -16.8 -7.7 -4.7 -6.3 -6.6 -14.6 -10.8 -12.8 -13.0 2.3 6.0 2.5 3.6 -50.0 -11.2 -50.0 -11.9 -10.5 -9.2 -10.5 -10.5 -50.0 -10.3 -16.2 -11.3 -1.7 -3.7 -2.5 -5.4 +ATG -7.1 -9.9 -5.7 -9.0 -0.4 -3.0 0.9 -2.3 -7.2 -8.3 -5.3 -8.0 3.5 0.2 14.3 1.0 -7.3 -11.2 -6.9 -9.7 -8.5 -10.2 -7.7 -9.8 -10.0 -10.4 -8.4 -10.2 1.0 -0.6 1.7 -0.5 -11.1 -14.8 -9.7 -14.5 -3.7 -5.8 -3.1 -5.6 -10.4 -11.5 -8.2 -11.1 0.6 -1.6 1.6 -1.1 -50.0 -12.4 -50.0 -11.5 -5.5 -8.0 -4.9 -7.2 -50.0 -12.4 -10.3 -11.6 0.7 -6.8 2.4 -6.0 +ATT -11.5 -9.6 -12.6 -7.0 -3.6 -3.3 -3.5 -0.6 -11.6 -7.7 -11.9 -5.3 9.7 10.5 1.0 12.6 -12.0 -12.2 -12.5 -9.7 -11.1 -11.2 -11.9 -9.4 -12.9 -14.3 -14.0 -11.9 -1.0 -0.4 -1.6 1.2 -13.9 -15.4 -13.7 -12.6 -6.2 -6.7 -6.3 -4.4 -13.0 -12.2 -13.0 -10.4 2.9 3.8 2.5 5.8 -50.0 -12.5 -50.0 -10.2 -9.6 -9.7 -9.6 -8.3 -50.0 -11.6 -15.4 -9.4 -1.1 -5.4 -1.6 -3.3 +CAA 0.4 -5.0 -1.5 -4.0 -4.8 -7.6 -5.9 -6.5 -0.7 -6.1 -2.2 -5.6 -8.7 -12.4 -7.3 -12.0 12.8 2.3 10.2 3.0 0.0 -3.4 -0.7 -3.2 2.5 -0.8 0.9 0.2 -3.0 -7.2 -5.7 -6.1 -0.2 -6.6 -1.8 -6.0 -5.3 -8.2 -5.7 -7.6 -6.7 -9.6 -7.5 -9.3 -7.1 -10.5 -9.6 -9.4 -50.0 -8.1 -50.0 -7.2 -4.3 -6.8 -5.7 -6.3 -50.0 -9.5 -9.6 -8.9 -6.2 -12.8 -6.8 -11.8 +CAC -6.0 0.5 -6.1 -1.0 -8.9 -6.7 -8.9 -8.2 -4.8 -2.9 -4.4 -4.5 -12.4 -11.6 -11.2 -12.2 2.3 14.6 1.9 12.9 -5.5 -2.6 -4.1 -4.7 -1.2 3.0 -1.1 1.6 -8.0 -4.7 -8.7 -6.0 -8.2 -4.7 -7.8 -6.7 -10.0 -8.3 -8.9 -9.9 -10.8 -7.1 -10.0 -9.2 -11.6 -10.1 -12.3 -11.9 -50.0 2.2 -50.0 0.9 -7.9 -5.9 -7.4 -6.9 -50.0 -4.6 -11.6 -5.6 -9.7 -5.9 -9.9 -7.3 +CAG -1.9 -5.5 -0.6 -5.0 -6.5 -8.1 -5.6 -8.0 -3.1 -6.5 -1.9 -6.8 -10.7 -13.2 -6.9 -12.5 10.2 1.9 11.9 2.1 -2.3 -4.1 -0.5 -4.6 -0.1 -0.9 2.4 -0.8 -5.1 -6.9 -4.8 -6.7 -2.3 -7.1 -0.8 -7.2 -7.2 -8.0 -5.6 -8.5 -9.4 -9.7 -7.2 -9.8 -9.2 -11.1 -9.5 -10.7 -50.0 -8.7 -50.0 -8.6 -6.1 -7.5 -5.5 -7.7 -50.0 -10.4 -7.7 -10.0 -7.3 -13.6 -6.4 -13.3 +CAT -5.3 -1.0 -6.1 1.0 -7.8 -7.9 -8.5 -6.3 -4.3 -4.2 -4.8 -2.4 -10.7 -12.0 -9.7 -9.7 3.0 12.9 2.1 14.7 -4.9 -3.8 -4.0 -2.4 -1.0 0.8 -1.3 3.4 -7.6 -5.7 -8.1 -3.5 -6.8 -5.8 -7.2 -4.2 -8.9 -10.2 -8.4 -8.2 -9.9 -8.4 -9.7 -6.4 -11.1 -11.1 -11.8 -10.0 -50.0 0.4 -50.0 2.6 -7.2 -7.0 -7.4 -5.1 -50.0 -5.7 -9.8 -3.2 -8.4 -7.4 -8.9 -5.9 +CCA -8.5 -10.3 -10.0 -9.0 -1.4 -4.9 -3.9 -3.7 -10.0 -8.5 -10.2 -7.8 -7.8 -12.4 -8.5 -11.1 0.0 -5.5 -2.3 -4.9 12.6 10.0 11.1 10.5 -6.0 -8.3 
-6.9 -7.5 -2.2 -7.4 -5.3 -5.9 -8.9 -12.5 -9.4 -11.6 -0.7 -3.9 -2.3 -3.0 -9.1 -9.8 -9.2 -9.5 -5.6 -9.0 -8.0 -8.5 -50.0 -15.3 -50.0 -14.1 2.3 -1.4 0.3 -0.5 -50.0 -13.6 -14.3 -11.5 -5.3 -13.7 -6.1 -12.7 +CCC -11.2 -8.1 -11.8 -8.8 -4.1 -1.9 -3.9 -3.5 -11.6 -5.9 -11.5 -7.9 -10.2 -10.1 -10.2 -11.2 -3.4 -2.6 -4.1 -3.8 10.0 13.1 10.7 10.6 -8.1 -5.2 -8.0 -6.7 -6.1 -3.7 -7.5 -5.6 -11.1 -10.3 -10.6 -11.9 -3.2 -1.0 -2.2 -2.8 -10.5 -8.1 -9.4 -9.5 -8.3 -6.9 -9.4 -8.8 -50.0 -11.3 -50.0 -12.8 -0.6 2.2 -0.3 0.1 -50.0 -10.1 -17.5 -11.1 -8.6 -9.9 -8.6 -11.9 +CCG -8.9 -9.4 -8.6 -8.9 -2.9 -4.5 -2.2 -3.9 -8.8 -7.5 -8.1 -7.9 -8.8 -11.6 -7.7 -11.9 -0.7 -4.1 -0.5 -4.0 11.1 10.7 13.2 10.4 -5.7 -5.8 -3.5 -6.1 -3.6 -6.5 -3.3 -5.5 -9.4 -11.0 -8.7 -11.2 -1.8 -2.7 0.7 -3.0 -9.1 -8.2 -7.3 -9.4 -7.2 -8.2 -7.2 -8.2 -50.0 -13.2 -50.0 -13.3 0.5 -0.9 1.8 -0.6 -50.0 -11.4 -11.1 -10.5 -6.1 -12.9 -5.0 -11.9 +CCT -10.8 -9.6 -11.9 -7.7 -3.3 -3.8 -4.5 -1.6 -11.7 -7.7 -12.0 -6.2 -9.6 -12.0 -9.8 -9.4 -3.2 -4.7 -4.6 -2.4 10.5 10.6 10.4 12.6 -8.2 -7.9 -9.0 -4.7 -6.1 -6.0 -7.5 -2.9 -10.9 -12.2 -11.2 -10.9 -2.4 -2.9 -2.7 -0.9 -10.4 -9.7 -10.2 -8.0 -8.4 -9.1 -9.3 -6.5 -50.0 -13.3 -50.0 -11.0 -0.0 -0.4 -0.4 2.3 -50.0 -10.9 -17.2 -8.3 -8.3 -12.7 -7.8 -9.3 +CGA 2.1 -8.1 1.2 -7.0 -8.0 -9.5 -7.3 -9.9 10.5 -5.5 9.3 -5.7 -9.5 -15.1 -10.0 -12.9 2.5 -1.2 -0.1 -1.0 -6.0 -8.1 -5.7 -8.2 13.8 11.3 11.8 12.1 -6.0 -9.1 -9.0 -9.5 -8.9 -13.4 -10.0 -13.1 -9.9 -11.7 -9.8 -12.8 -5.9 -9.4 -6.5 -8.1 -10.7 -14.0 -13.3 -13.9 -50.0 -11.9 -50.0 -9.7 -8.9 -10.8 -9.0 -11.1 -50.0 -6.5 -5.3 -6.3 -10.5 -16.2 -10.4 -15.2 +CGC 0.0 -5.0 0.5 -6.8 -9.1 -7.8 -7.5 -8.9 7.7 -2.4 8.2 -5.1 -11.9 -13.1 -10.4 -14.3 -0.8 3.0 -0.9 0.8 -8.3 -5.2 -5.8 -7.9 11.3 15.0 11.2 12.8 -9.7 -5.3 -9.2 -7.8 -11.7 -10.6 -10.1 -12.2 -11.7 -8.8 -8.9 -11.8 -9.3 -5.0 -8.0 -8.2 -13.6 -11.4 -12.8 -13.0 -50.0 -7.2 -50.0 -8.4 -11.0 -7.7 -8.7 -10.2 -50.0 -1.5 -7.1 -4.9 -11.2 -11.6 -11.6 -15.0 +CGG 1.4 -7.3 2.2 -7.1 -8.0 -9.3 -6.4 -9.6 9.1 -5.5 10.0 -5.4 -10.9 -14.3 -8.4 -14.0 0.9 -1.1 2.4 -1.3 -6.9 -8.0 -3.5 -9.0 11.8 11.2 13.4 11.4 -7.3 -8.8 -6.5 -8.5 -10.0 -12.5 -7.8 -12.5 -9.6 -11.0 -7.1 -10.7 -7.9 -8.0 -4.3 -9.4 -11.4 -12.3 -10.9 -12.9 -50.0 -11.8 -50.0 -11.2 -9.4 -9.8 -7.3 -10.5 -50.0 -6.2 -2.2 -6.7 -9.8 -15.2 -8.5 -14.9 +CGT 0.2 -6.3 0.1 -4.3 -8.2 -8.9 -8.5 -7.0 8.5 -4.1 8.0 -2.1 -10.5 -13.4 -10.2 -11.9 0.2 1.6 -0.8 3.4 -7.5 -6.7 -6.1 -4.7 12.1 12.8 11.4 14.7 -8.7 -7.4 -9.4 -5.3 -9.9 -11.8 -10.5 -10.3 -9.9 -10.1 -9.3 -9.3 -8.1 -7.2 -8.0 -5.2 -12.0 -12.1 -12.6 -10.9 -50.0 -7.9 -50.0 -5.3 -9.3 -8.8 -8.9 -7.0 -50.0 -3.6 -7.1 -1.2 -10.2 -12.7 -9.9 -11.0 +CTA -10.2 -13.4 -11.8 -12.4 -6.0 -9.4 -7.1 -8.6 -9.8 -12.9 -10.8 -12.4 2.3 -1.4 1.0 -1.0 -3.0 -8.0 -5.1 -7.6 -2.2 -6.1 -3.6 -6.1 -6.0 -9.7 -7.3 -8.7 11.2 7.9 8.9 8.1 -12.1 -17.4 -13.2 -16.8 -6.3 -9.5 -7.5 -8.8 -12.7 -15.2 -13.1 -14.2 0.0 -3.1 -1.8 -3.1 -50.0 -10.7 -50.0 -9.3 -4.6 -8.9 -5.9 -8.3 -50.0 -12.8 -9.6 -10.6 9.5 -3.2 8.2 -2.8 +CTC -13.5 -11.3 -14.0 -12.9 -9.3 -7.1 -8.2 -8.9 -12.7 -10.3 -11.9 -12.3 -0.4 1.4 -0.6 -0.4 -7.2 -4.7 -6.9 -5.7 -7.4 -3.7 -6.5 -6.0 -9.1 -5.3 -8.8 -7.4 7.9 11.9 7.8 9.3 -15.2 -15.1 -14.6 -17.2 -9.7 -7.3 -8.2 -9.2 -15.0 -12.8 -14.1 -14.9 -2.8 0.2 -2.9 -2.3 -50.0 -7.4 -50.0 -8.7 -9.4 -7.3 -9.4 -9.4 -50.0 -8.0 -11.9 -9.7 6.5 0.5 6.4 -1.8 +CTG -13.0 -14.4 -11.9 -13.9 -7.7 -9.0 -6.7 -8.8 -11.7 -12.7 -10.7 -13.1 -0.2 -1.6 1.7 -1.6 -5.7 -8.7 -4.8 -8.1 -5.3 -7.5 -3.3 -7.5 -9.0 -9.2 -6.5 -9.4 8.9 7.8 10.1 7.8 -14.3 -17.7 -13.0 -17.8 -8.0 -9.4 -6.2 -9.2 -15.1 -14.9 -12.2 -14.9 -1.8 -3.2 -0.8 -3.2 -50.0 -10.8 -50.0 -9.9 -7.3 -9.6 -6.4 
-9.4 -50.0 -12.3 -8.7 -11.3 7.6 -3.3 8.7 -2.9 +CTT -12.5 -12.9 -13.4 -10.0 -8.5 -9.0 -7.8 -6.8 -11.8 -11.3 -12.4 -10.1 -0.3 -0.5 -0.5 1.2 -6.1 -6.0 -6.7 -3.5 -5.9 -5.6 -5.5 -2.9 -9.5 -7.8 -8.5 -5.3 8.1 9.3 7.8 11.8 -14.4 -15.7 -14.5 -14.6 -8.4 -8.7 -7.9 -7.1 -14.3 -13.8 -13.8 -12.3 -2.4 -2.2 -2.9 -0.2 -50.0 -8.9 -50.0 -7.1 -8.1 -9.1 -8.7 -6.6 -50.0 -9.7 -11.4 -8.0 6.8 -1.8 6.9 0.4 +GAA -2.6 -6.3 -4.9 -4.9 -6.4 -9.4 -7.9 -8.6 -6.3 -7.4 -8.2 -6.6 -9.7 -14.9 -11.1 -13.9 -0.2 -8.2 -2.3 -6.8 -8.9 -11.1 -9.4 -10.9 -8.9 -11.7 -10.0 -9.9 -12.1 -15.2 -14.3 -14.4 11.1 2.9 9.2 3.5 -3.0 -7.0 -4.3 -6.2 -2.2 -6.6 -3.7 -5.7 -6.0 -10.4 -8.6 -9.5 -50.0 -15.5 -50.0 -13.9 -7.8 -10.5 -8.7 -9.6 -50.0 -16.5 -17.1 -14.8 -12.1 -18.9 -12.4 -17.4 +GAC -8.5 0.8 -9.1 -0.7 -10.1 -7.5 -9.1 -8.2 -11.4 -3.1 -11.8 -4.4 -15.5 -14.3 -14.8 -15.4 -6.6 -4.7 -7.1 -5.8 -12.5 -10.3 -11.0 -12.2 -13.4 -10.6 -12.5 -11.8 -17.4 -15.1 -17.7 -15.7 2.9 12.5 3.1 10.3 -7.7 -5.2 -6.2 -7.3 -5.4 -1.7 -5.4 -3.7 -11.6 -9.1 -13.0 -10.8 -50.0 -10.3 -50.0 -11.2 -9.8 -8.3 -9.1 -9.7 -50.0 -12.7 -20.6 -13.1 -16.0 -16.6 -17.0 -18.4 +GAG -5.0 -6.4 -3.4 -5.6 -7.5 -9.3 -7.1 -8.8 -8.8 -7.4 -6.9 -6.9 -11.4 -15.2 -9.7 -13.7 -1.8 -7.8 -0.8 -7.2 -9.4 -10.6 -8.7 -11.2 -10.0 -10.1 -7.8 -10.5 -13.2 -14.6 -13.0 -14.5 9.2 3.1 10.8 3.0 -4.5 -6.7 -2.7 -6.7 -5.0 -6.8 -2.5 -6.7 -7.5 -10.4 -7.4 -10.2 -50.0 -15.5 -50.0 -14.2 -8.9 -10.2 -8.5 -10.0 -50.0 -15.6 -15.3 -15.5 -13.0 -18.8 -12.6 -18.3 +GAT -8.1 -1.1 -8.9 1.4 -8.6 -9.2 -9.4 -6.8 -11.0 -4.9 -11.5 -2.6 -13.7 -16.8 -14.5 -12.6 -6.0 -6.7 -7.2 -4.2 -11.6 -11.9 -11.2 -10.9 -13.1 -12.2 -12.5 -10.3 -16.8 -17.2 -17.8 -14.6 3.5 10.3 3.0 12.4 -6.9 -7.5 -6.8 -4.9 -5.4 -4.2 -5.4 -1.3 -10.8 -11.5 -12.4 -8.2 -50.0 -12.5 -50.0 -8.9 -9.2 -10.0 -9.4 -7.9 -50.0 -14.3 -19.9 -11.4 -16.2 -19.2 -15.8 -15.9 +GCA -6.3 -7.4 -8.0 -6.0 2.8 -0.7 0.9 0.3 -7.5 -4.4 -8.4 -3.6 -3.0 -7.7 -3.7 -6.2 -5.3 -10.0 -7.2 -8.9 -0.7 -3.2 -1.8 -2.4 -9.9 -11.7 -9.6 -9.9 -6.3 -9.7 -8.0 -8.4 -3.0 -7.7 -4.5 -6.9 11.3 8.2 9.4 9.1 -1.0 -3.3 -1.8 -2.4 1.5 -2.1 -0.4 -1.1 -50.0 -14.5 -50.0 -12.9 2.8 -0.0 1.4 0.7 -50.0 -8.7 -14.1 -7.5 -5.2 -12.5 -6.5 -10.7 +GCC -9.9 -5.0 -10.0 -6.5 -0.7 2.6 -0.2 0.1 -10.3 -1.8 -9.9 -3.8 -6.4 -4.7 -5.8 -6.7 -8.2 -8.3 -8.0 -10.2 -3.9 -1.0 -2.7 -2.9 -11.7 -8.8 -11.0 -10.1 -9.5 -7.3 -9.4 -8.7 -7.0 -5.2 -6.7 -7.5 8.2 11.6 8.8 9.1 -3.7 -0.9 -3.1 -3.0 -2.0 1.2 -2.3 -1.3 -50.0 -12.1 -50.0 -12.6 0.1 2.5 0.7 0.6 -50.0 -6.0 -14.8 -7.5 -8.6 -9.4 -8.5 -11.2 +GCG -7.5 -6.2 -7.2 -6.2 0.9 0.2 2.5 0.2 -8.3 -3.5 -7.1 -3.7 -4.1 -6.3 -3.1 -6.3 -5.7 -8.9 -5.6 -8.4 -2.3 -2.2 0.7 -2.7 -9.8 -8.9 -7.1 -9.3 -7.5 -8.2 -6.2 -7.9 -4.3 -6.2 -2.7 -6.8 9.4 8.8 12.1 8.8 -2.1 -2.0 0.1 -2.4 0.0 -1.1 1.3 -0.8 -50.0 -11.9 -50.0 -12.6 1.2 0.7 3.5 0.8 -50.0 -7.1 -12.4 -7.1 -7.1 -10.6 -5.0 -10.6 +GCT -9.0 -6.5 -10.0 -4.8 0.1 -0.1 -0.3 2.4 -9.9 -3.5 -10.6 -2.0 -5.4 -6.6 -5.6 -4.4 -7.6 -9.9 -8.5 -8.2 -3.0 -2.8 -3.0 -0.9 -12.8 -11.8 -10.7 -9.3 -8.8 -9.2 -9.2 -7.1 -6.2 -7.3 -6.7 -4.9 9.1 9.1 8.8 11.2 -3.1 -3.0 -3.3 -1.0 -1.1 -1.4 -2.0 1.2 -50.0 -12.8 -50.0 -10.9 0.8 0.5 0.7 2.3 -50.0 -7.5 -16.5 -5.9 -7.8 -11.1 -7.8 -9.0 +GGA -7.1 -5.6 -9.1 -5.1 -5.9 -8.0 -6.9 -7.4 -1.9 -1.6 -4.7 -1.4 -9.6 -14.6 -10.4 -13.0 -6.7 -10.8 -9.4 -9.9 -9.1 -10.5 -9.1 -10.4 -5.9 -9.3 -7.9 -8.1 -12.7 -15.0 -15.1 -14.3 -2.2 -5.4 -5.0 -5.4 -1.0 -3.7 -2.1 -3.1 12.8 9.6 11.1 10.1 -4.8 -9.4 -8.1 -8.6 -50.0 -18.3 -50.0 -15.4 -5.9 -7.5 -6.6 -7.1 -50.0 -9.5 -11.7 -8.4 -12.5 -17.3 -13.2 -15.4 +GGC -10.2 -1.6 -10.2 -3.2 -7.3 -4.6 -6.5 -6.2 -6.7 3.0 -6.8 0.8 -11.9 -10.8 -11.5 -12.2 -9.6 -7.1 -9.7 -8.4 -9.8 -8.1 
-8.2 -9.7 -9.4 -5.0 -8.0 -7.2 -15.2 -12.8 -14.9 -13.8 -6.6 -1.7 -6.8 -4.2 -3.3 -0.9 -2.0 -3.0 9.6 12.8 9.7 10.5 -8.5 -5.6 -9.3 -7.8 -50.0 -12.7 -50.0 -13.8 -6.8 -5.1 -5.3 -6.5 -50.0 -4.1 -12.8 -5.8 -14.4 -14.1 -13.6 -14.8 +GGG -8.2 -4.7 -7.1 -5.0 -6.5 -7.7 -5.5 -6.9 -4.1 -0.8 -1.2 -0.9 -10.5 -12.8 -8.2 -13.0 -7.5 -10.0 -7.2 -9.7 -9.2 -9.4 -7.3 -10.2 -6.5 -8.0 -4.3 -8.0 -13.1 -14.1 -12.2 -13.8 -3.7 -5.4 -2.5 -5.4 -1.8 -3.1 0.1 -3.3 11.1 9.7 12.9 9.8 -5.5 -8.1 -5.4 -7.8 -50.0 -16.4 -50.0 -14.8 -5.9 -7.1 -4.7 -7.2 -50.0 -8.4 -6.4 -8.1 -12.1 -16.2 -10.3 -15.8 +GGT -9.2 -3.0 -9.9 -1.2 -6.5 -6.1 -6.8 -4.4 -6.4 0.9 -7.0 3.0 -11.8 -13.0 -11.1 -10.4 -9.3 -9.2 -9.8 -6.4 -9.5 -9.5 -9.4 -8.0 -8.1 -8.2 -9.4 -5.2 -14.2 -14.9 -14.9 -12.3 -5.7 -3.7 -6.7 -1.3 -2.4 -3.0 -2.4 -1.0 10.1 10.5 9.8 13.1 -7.2 -8.0 -8.4 -5.0 -50.0 -14.7 -50.0 -11.4 -6.3 -6.7 -5.8 -5.0 -50.0 -6.1 -13.4 -3.6 -13.0 -16.0 -13.7 -13.3 +GTA -8.2 -10.8 -10.1 -9.8 -0.4 -3.6 -1.7 -2.9 -7.9 -8.4 -9.8 -8.2 6.2 2.3 0.6 2.9 -7.1 -11.6 -9.2 -11.1 -5.6 -8.3 -7.2 -8.4 -10.7 -13.6 -11.4 -12.0 0.0 -2.8 -1.8 -2.4 -6.0 -11.6 -7.5 -10.8 1.5 -2.0 0.0 -1.1 -4.8 -8.5 -5.5 -7.2 11.9 8.6 10.0 9.0 -50.0 -14.2 -50.0 -12.8 -4.1 -7.4 -5.7 -6.5 -50.0 -11.0 -14.3 -9.9 1.4 -7.9 -0.7 -6.8 +GTC -12.5 -8.7 -13.0 -9.9 -3.9 -1.0 -3.4 -2.9 -12.5 -6.3 -11.6 -8.1 3.3 6.0 -1.6 3.8 -10.5 -10.1 -11.1 -11.1 -9.0 -6.9 -8.2 -9.1 -14.0 -11.4 -12.3 -12.1 -3.1 0.2 -3.2 -2.2 -10.4 -9.1 -10.4 -11.5 -2.1 1.2 -1.1 -1.4 -9.4 -5.6 -8.1 -8.0 8.6 12.4 8.6 9.5 -50.0 -10.4 -50.0 -11.6 -7.2 -5.7 -7.5 -7.4 -50.0 -8.6 -15.7 -9.7 -3.0 -3.7 -3.2 -5.9 +GTG -11.1 -11.9 -10.6 -11.3 -2.6 -4.1 -1.7 -3.5 -11.4 -9.6 -9.6 -9.2 3.7 2.5 1.6 2.5 -9.6 -12.3 -9.5 -11.8 -8.0 -9.4 -7.2 -9.3 -13.3 -12.8 -10.9 -12.6 -1.8 -2.9 -0.8 -2.9 -8.6 -13.0 -7.4 -12.4 -0.4 -2.3 1.3 -2.0 -8.1 -9.3 -5.4 -8.4 10.0 8.6 11.4 8.9 -50.0 -13.7 -50.0 -13.2 -6.1 -8.1 -5.5 -7.5 -50.0 -11.3 -12.7 -10.1 -1.6 -7.8 -0.3 -7.2 +GTT -11.4 -10.0 -12.7 -8.2 -3.1 -3.1 -2.8 -0.9 -11.7 -7.3 -12.1 -5.8 3.6 3.6 -1.1 5.8 -9.4 -11.9 -10.7 -10.0 -8.5 -8.8 -8.2 -6.5 -13.9 -13.0 -12.9 -10.9 -3.1 -2.3 -3.2 -0.2 -9.5 -10.8 -10.2 -8.2 -1.1 -1.3 -0.8 1.2 -8.6 -7.8 -7.8 -5.0 9.0 9.5 8.9 12.0 -50.0 -13.0 -50.0 -10.3 -6.2 -7.0 -7.1 -5.4 -50.0 -9.3 -14.0 -8.2 -2.4 -6.3 -2.4 -3.9 +TAA -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 33.3 -50.0 30.6 -50.0 -50.0 -50.0 -50.0 -50.0 29.2 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 +TAC -14.8 -6.2 -14.9 -7.6 -14.6 -11.6 -13.0 -12.2 -14.0 -9.0 -14.1 -10.1 -13.6 -11.2 -12.4 -12.5 -8.1 2.2 -8.7 0.4 -15.3 -11.3 -13.2 -13.3 -11.9 -7.2 -11.8 -7.9 -10.7 -7.4 -10.8 -8.9 -15.5 -10.3 -15.5 -12.5 -14.5 -12.1 -11.9 -12.8 -18.3 -12.7 -16.4 -14.7 -14.2 -10.4 -13.7 -13.0 -50.0 15.1 -50.0 13.3 -9.9 -5.8 -8.9 -7.6 -50.0 -1.5 -7.8 -3.0 -8.1 3.6 -9.4 2.0 +TAG -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 30.6 -50.0 35.2 -50.0 -50.0 -50.0 -50.0 -50.0 28.5 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 +TAT -13.8 -7.5 -14.4 -5.1 -12.6 -11.6 -11.5 -10.1 -13.2 -9.7 -12.7 -7.4 -11.9 -11.9 -11.5 -10.2 -7.2 0.9 -8.6 2.6 -14.1 -12.8 -13.3 -11.0 -9.7 -8.4 -11.2 -5.3 -9.3 -8.7 
-9.9 -7.1 -13.9 -11.2 -14.2 -8.9 -12.9 -12.6 -12.6 -10.9 -15.4 -13.8 -14.8 -11.4 -12.8 -11.6 -13.2 -10.3 -50.0 13.3 -50.0 15.2 -8.6 -7.0 -8.4 -4.8 -50.0 -3.0 -7.1 -0.1 -7.3 2.3 -8.7 3.9 +TCA -7.3 -6.3 -8.9 -5.5 2.9 0.2 1.5 1.2 -8.5 -2.3 -10.0 -1.7 -5.6 -10.5 -5.5 -9.6 -4.3 -7.9 -6.1 -7.2 2.3 -0.6 0.5 -0.0 -8.9 -11.0 -9.4 -9.3 -4.6 -9.4 -7.3 -8.1 -7.8 -9.8 -8.9 -9.2 2.8 0.1 1.2 0.8 -5.9 -6.8 -5.9 -6.3 -4.1 -7.2 -6.1 -6.2 -50.0 -9.9 -50.0 -8.6 12.5 9.4 11.0 9.8 -50.0 -4.8 -8.7 -3.5 -0.0 -8.6 -2.1 -7.0 +TCC -10.1 -4.3 -10.8 -5.8 0.1 2.3 0.7 0.6 -10.4 -0.2 -10.8 -2.0 -8.8 -9.2 -8.0 -9.7 -6.8 -5.9 -7.5 -7.0 -1.4 2.2 -0.9 -0.4 -10.8 -7.7 -9.8 -8.8 -8.9 -7.3 -9.6 -9.1 -10.5 -8.3 -10.2 -10.0 -0.0 2.5 0.7 0.5 -7.5 -5.1 -7.1 -6.7 -7.4 -5.7 -8.1 -7.0 -50.0 -5.8 -50.0 -7.0 9.4 12.7 10.1 10.0 -50.0 -1.5 -11.3 -3.3 -5.4 -4.2 -5.6 -6.8 +TCG -8.4 -6.2 -8.9 -5.9 1.8 0.4 2.7 1.2 -9.1 -1.2 -8.8 -1.5 -7.2 -10.5 -4.9 -9.6 -5.7 -7.4 -5.5 -7.4 0.3 -0.3 1.8 -0.4 -9.0 -8.7 -7.3 -8.9 -5.9 -9.4 -6.4 -8.7 -8.7 -9.1 -8.5 -9.4 1.4 0.7 3.5 0.7 -6.6 -5.3 -4.7 -5.8 -5.7 -7.5 -5.5 -7.1 -50.0 -8.9 -50.0 -8.4 11.0 10.1 13.2 10.2 -50.0 -4.2 -6.1 -3.6 -3.5 -8.4 -1.2 -6.8 +TCT -9.1 -5.4 -10.4 -4.2 1.0 0.5 0.6 2.4 -9.6 -1.5 -10.5 -0.5 -8.9 -10.5 -7.2 -8.3 -6.3 -6.9 -7.7 -5.1 -0.5 0.1 -0.6 2.3 -11.1 -10.2 -10.5 -7.0 -8.3 -9.4 -9.4 -6.6 -9.6 -9.7 -10.0 -7.9 0.7 0.6 0.8 2.3 -7.1 -6.5 -7.2 -5.0 -6.5 -7.4 -7.5 -5.4 -50.0 -7.6 -50.0 -4.8 9.8 10.0 10.2 12.1 -50.0 -3.1 -11.4 -0.7 -4.6 -6.6 -4.9 -3.9 +TGA -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 29.2 -50.0 28.5 -50.0 -50.0 -50.0 -50.0 -50.0 33.3 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 +TGC -13.0 -7.0 -13.1 -8.3 -9.2 -6.5 -8.9 -7.6 -9.8 -0.7 -9.2 -2.1 -12.4 -10.3 -12.4 -11.6 -9.5 -4.6 -10.4 -5.7 -13.6 -10.1 -11.4 -10.9 -6.5 -1.5 -6.2 -3.6 -12.8 -8.0 -12.3 -9.7 -16.5 -12.7 -15.6 -14.3 -8.7 -6.0 -7.1 -7.5 -9.5 -4.1 -8.4 -6.1 -11.0 -8.6 -11.3 -9.3 -50.0 -1.5 -50.0 -3.0 -4.8 -1.5 -4.2 -3.1 -50.0 16.4 -5.1 14.2 -10.1 -4.3 -10.0 -5.4 +TGG -13.5 -16.3 -11.8 -15.5 -13.6 -15.3 -12.1 -16.1 -7.7 -13.3 -4.2 -12.6 -14.1 -16.2 -10.3 -15.4 -9.6 -11.6 -7.7 -9.8 -14.3 -17.5 -11.1 -17.2 -5.3 -7.1 -2.2 -7.1 -9.6 -11.9 -8.7 -11.4 -17.1 -20.6 -15.3 -19.9 -14.1 -14.8 -12.4 -16.5 -11.7 -12.8 -6.4 -13.4 -14.3 -15.7 -12.7 -14.0 -50.0 -7.8 -50.0 -7.1 -8.7 -11.3 -6.1 -11.4 -50.0 -5.1 18.6 -4.8 -8.6 -8.1 -4.5 -7.3 +TGT -12.4 -8.2 -13.0 -5.9 -7.7 -7.8 -7.4 -5.8 -8.8 -2.0 -9.3 -0.4 -11.6 -11.3 -11.6 -9.4 -8.9 -5.6 -10.0 -3.2 -11.5 -11.1 -10.5 -8.3 -6.3 -4.9 -6.7 -1.2 -10.6 -9.7 -11.3 -8.0 -14.8 -13.1 -15.5 -11.4 -7.5 -7.5 -7.1 -5.9 -8.4 -5.8 -8.1 -3.6 -9.9 -9.7 -10.1 -8.2 -50.0 -3.0 -50.0 -0.1 -3.5 -3.3 -3.6 -0.7 -50.0 14.2 -4.8 16.4 -8.9 -5.8 -8.9 -3.4 +TTA -10.7 -13.2 -12.4 -11.6 -4.8 -8.4 -6.1 -7.2 -10.3 -11.8 -11.0 -11.6 2.8 -1.7 0.7 -1.1 -6.2 -9.7 -7.3 -8.4 -5.3 -8.6 -6.1 -8.3 -10.5 -11.2 -9.8 -10.2 9.5 6.5 7.6 6.8 -12.1 -16.0 -13.0 -16.2 -5.2 -8.6 -7.1 -7.8 -12.5 -14.4 -12.1 -13.0 1.4 -3.0 -1.6 -2.4 -50.0 -8.1 -50.0 -7.3 -0.0 -5.4 -3.5 -4.6 -50.0 -10.1 -8.6 -8.9 13.2 -0.9 9.7 -0.3 +TTC -18.1 -12.3 -19.4 -14.2 -12.2 -9.5 -11.5 -10.8 -17.9 -11.9 -18.2 -13.9 -6.4 -3.7 -6.8 -5.4 -12.8 -5.9 -13.6 -7.4 -13.7 -9.9 -12.9 -12.7 -16.2 -11.6 -15.2 -12.7 -3.2 0.5 -3.3 -1.8 -18.9 -16.6 -18.8 -19.2 -12.5 -9.4 -10.6 -11.1 -17.3 -14.1 -16.2 -16.0 -7.9 -3.7 -7.8 -6.3 -50.0 3.6 -50.0 
2.3 -8.6 -4.2 -8.4 -6.6 -50.0 -4.3 -8.1 -5.8 -0.9 14.2 -1.8 11.6 +TTG -11.8 -13.1 -11.5 -12.0 -6.2 -8.3 -5.4 -7.4 -11.8 -12.3 -11.1 -11.4 0.5 -2.5 2.4 -1.6 -6.8 -9.9 -6.4 -8.9 -6.1 -8.6 -5.0 -7.8 -10.4 -11.6 -8.5 -9.9 8.2 6.4 8.7 6.9 -12.4 -17.0 -12.6 -15.8 -6.5 -8.5 -5.0 -7.8 -13.2 -13.6 -10.3 -13.7 -0.7 -3.2 -0.3 -2.4 -50.0 -9.4 -50.0 -8.7 -2.1 -5.6 -1.2 -4.9 -50.0 -10.0 -4.5 -8.9 9.7 -1.8 11.3 -0.9 +TTT -17.2 -13.3 -17.8 -11.6 -11.0 -11.1 -10.4 -9.0 -16.0 -12.7 -16.1 -11.1 -5.2 -5.4 -6.0 -3.3 -11.8 -7.3 -13.3 -5.9 -12.7 -11.9 -11.9 -9.3 -15.2 -15.0 -14.9 -11.0 -2.8 -1.8 -2.9 0.4 -17.4 -18.4 -18.3 -15.9 -10.7 -11.2 -10.6 -9.0 -15.4 -14.8 -15.8 -13.3 -6.8 -5.9 -7.2 -3.9 -50.0 2.0 -50.0 3.9 -7.0 -6.8 -6.8 -3.9 -50.0 -5.4 -7.3 -3.4 -0.3 11.6 -0.9 14.1 diff --git a/code/lib/Bio/Align/substitution_matrices/data/STR b/code/lib/Bio/Align/substitution_matrices/data/STR new file mode 100644 index 0000000..23189c3 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/STR @@ -0,0 +1,26 @@ +# Steven Henikoff and Jorja G. Henikoff: +# "Performance evaluation of amino acid substitution matrices." +# Proteins: Structure, Function, and Genetics: 17(1): 49-61 (1993). +# Figure 1, lower triangle. +# PMID 8234244 + A C D E F G H I K L M N P Q R S T V W Y +A 4 -2 -1 0 -3 0 -2 -2 -1 -2 0 -1 -1 0 -1 0 -1 0 -3 -3 +C -2 11 -7 -3 -2 -6 -6 -4 -4 -6 -5 -6 -8 -3 -2 -4 -5 -4 -6 -6 +D -1 -7 6 2 -5 -1 0 -3 -1 -6 -4 2 -1 0 -2 0 -1 -4 -6 -3 +E 0 -3 2 5 -4 -2 -2 -3 1 -4 -2 0 -1 2 0 -1 0 -2 -6 -2 +F -3 -2 -5 -4 7 -6 -2 1 -3 2 0 -3 -5 -4 -4 -3 -3 -1 2 3 +G 0 -6 -1 -2 -6 5 -3 -5 -3 -5 -4 -1 -2 -2 -2 -1 -3 -4 -4 -3 +H -2 -6 0 -2 -2 -3 8 -5 0 -3 -2 2 -3 0 0 -2 -2 -2 -3 0 +I -2 -4 -3 -3 1 -5 -5 6 -3 2 1 -3 -4 -5 -3 -3 -2 2 -2 -1 +K -1 -4 -1 1 -3 -3 0 -3 5 -2 -1 0 -1 1 2 -1 0 -3 -3 -2 +L -2 -6 -6 -4 2 -5 -3 2 -2 5 3 -3 -3 -3 -3 -4 -3 1 -1 -2 +M 0 -5 -4 -2 0 -4 -2 1 -1 3 8 -2 -6 1 -4 -4 -2 0 -2 -1 +N -1 -6 2 0 -3 -1 2 -3 0 -3 -2 5 -2 0 -1 0 0 -4 -5 -1 +P -1 -8 -1 -1 -5 -2 -3 -4 -1 -3 -6 -2 7 -2 -2 -1 -1 -4 -4 -6 +Q 0 -3 0 2 -4 -2 0 -5 1 -3 1 0 -2 6 1 -1 0 -2 -5 -3 +R -1 -2 -2 0 -4 -2 0 -3 2 -3 -4 -1 -2 1 7 0 -1 -3 -2 -1 +S 0 -4 0 -1 -3 -1 -2 -3 -1 -4 -4 0 -1 -1 0 4 1 -3 -5 -2 +T -1 -5 -1 0 -3 -3 -2 -2 0 -3 -2 0 -1 0 -1 1 5 -1 -5 -2 +V 0 -4 -4 -2 -1 -4 -2 2 -3 1 0 -4 -4 -2 -3 -3 -1 5 -4 -1 +W -3 -6 -6 -6 2 -4 -3 -2 -3 -1 -2 -5 -4 -5 -2 -5 -5 -4 10 2 +Y -3 -6 -3 -2 3 -3 0 -1 -2 -2 -1 -1 -6 -3 -1 -2 -2 -1 2 7 diff --git a/code/lib/Bio/Align/substitution_matrices/data/TRANS b/code/lib/Bio/Align/substitution_matrices/data/TRANS new file mode 100644 index 0000000..611e6b9 --- /dev/null +++ b/code/lib/Bio/Align/substitution_matrices/data/TRANS @@ -0,0 +1,12 @@ +# David Wheeler, +# Department of Cell Biology, Baylor College of Medicine, Houston, Texas: +# "Weight matrices for sequence similarity scoring." +# Version 2.0, May 1996. +# David Wheeler defined the Transition/Transversion Matrix as a penalty +# matrix; the matrix below is a similarity matrix where +# similarity = 5 - penalty. + A T C G +A 5 0 0 4 +T 0 5 4 0 +C 0 4 5 0 +G 4 0 0 5 diff --git a/code/lib/Bio/AlignIO/ClustalIO.py b/code/lib/Bio/AlignIO/ClustalIO.py new file mode 100644 index 0000000..49fc51a --- /dev/null +++ b/code/lib/Bio/AlignIO/ClustalIO.py @@ -0,0 +1,305 @@ +# Copyright 2006-2016 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
+# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). +""" +from Bio.Align import MultipleSeqAlignment +from Bio.AlignIO.Interfaces import AlignmentIterator +from Bio.AlignIO.Interfaces import SequentialAlignmentWriter +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +class ClustalWriter(SequentialAlignmentWriter): + """Clustalw alignment writer.""" + + def write_alignment(self, alignment): + """Use this to write (another) single alignment to an open file.""" + if len(alignment) == 0: + raise ValueError("Must have at least one sequence") + if alignment.get_alignment_length() == 0: + # This doubles as a check for an alignment object + raise ValueError("Non-empty sequences are required") + + # Old versions of the parser in Bio.Clustalw used a ._version property + try: + version = str(alignment._version) + except AttributeError: + version = "" + if not version: + version = "1.81" + if version.startswith("2."): + # e.g. 2.0.x + output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version + else: + # e.g. 1.81 or 1.83 + output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version + + cur_char = 0 + max_length = len(alignment[0]) + + if max_length <= 0: + raise ValueError("Non-empty sequences are required") + + if "clustal_consensus" in alignment.column_annotations: + star_info = alignment.column_annotations["clustal_consensus"] + else: + try: + # This was originally stored by Bio.Clustalw as ._star_info + star_info = alignment._star_info + except AttributeError: + star_info = None + + # keep displaying sequences until we reach the end + while cur_char != max_length: + # calculate the number of sequences to show, which will + # be less if we are at the end of the sequence + if (cur_char + 50) > max_length: + show_num = max_length - cur_char + else: + show_num = 50 + + # go through all of the records and print out the sequences + # when we output, we do a nice 80 column output, although this + # may result in truncation of the ids. + for record in alignment: + # Make sure we don't get any spaces in the record + # identifier when output in the file by replacing + # them with underscores: + line = record.id[0:30].replace(" ", "_").ljust(36) + line += str(record.seq[cur_char : (cur_char + show_num)]) + output += line + "\n" + + # now we need to print out the star info, if we've got it + if star_info: + output += ( + (" " * 36) + star_info[cur_char : (cur_char + show_num)] + "\n" + ) + + output += "\n" + cur_char += show_num + + # Want a trailing blank new line in case the output is concatenated + self.handle.write(output + "\n") + + +class ClustalIterator(AlignmentIterator): + """Clustalw alignment iterator.""" + + _header = None # for caching lines between __next__ calls + + def __next__(self): + """Parse the next alignment from the handle.""" + handle = self.handle + + if self._header is None: + line = handle.readline() + else: + # Header we saved from when we were parsing + # the previous alignment. 
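+            # (The parser supports concatenated alignments in one file:
+            # when the next CLUSTAL header is met mid-stream it is stashed
+            # in self._header for this branch to pick up.)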
+ line = self._header + self._header = None + + if not line: + raise StopIteration + + # Whitelisted headers we know about + known_headers = ["CLUSTAL", "PROBCONS", "MUSCLE", "MSAPROBS", "Kalign"] + if line.strip().split()[0] not in known_headers: + raise ValueError( + "%s is not a known CLUSTAL header: %s" + % (line.strip().split()[0], ", ".join(known_headers)) + ) + + # find the clustal version in the header line + version = None + for word in line.split(): + if word[0] == "(" and word[-1] == ")": + word = word[1:-1] + if word[0] in "0123456789": + version = word + break + + # There should be two blank lines after the header line + line = handle.readline() + while line.strip() == "": + line = handle.readline() + + # If the alignment contains entries with the same sequence + # identifier (not a good idea - but seems possible), then this + # dictionary based parser will merge their sequences. Fix this? + ids = [] + seqs = [] + consensus = "" + seq_cols = None # Used to extract the consensus + + # Use the first block to get the sequence identifiers + while True: + if line[0] != " " and line.strip() != "": + # Sequences identifier... + fields = line.rstrip().split() + + # We expect there to be two fields, there can be an optional + # "sequence number" field containing the letter count. + if len(fields) < 2 or len(fields) > 3: + raise ValueError("Could not parse line:\n%s" % line) + + ids.append(fields[0]) + seqs.append(fields[1]) + + # Record the sequence position to get the consensus + if seq_cols is None: + start = len(fields[0]) + line[len(fields[0]) :].find(fields[1]) + end = start + len(fields[1]) + seq_cols = slice(start, end) + del start, end + assert fields[1] == line[seq_cols] + + if len(fields) == 3: + # This MAY be an old style file with a letter count... + try: + letters = int(fields[2]) + except ValueError: + raise ValueError( + "Could not parse line, bad sequence number:\n%s" % line + ) from None + if len(fields[1].replace("-", "")) != letters: + raise ValueError( + "Could not parse line, invalid sequence number:\n%s" % line + ) + elif line[0] == " ": + # Sequence consensus line... + assert len(ids) == len(seqs) + assert len(ids) > 0 + assert seq_cols is not None + consensus = line[seq_cols] + assert not line[: seq_cols.start].strip() + assert not line[seq_cols.stop :].strip() + # Check for blank line (or end of file) + line = handle.readline() + assert line.strip() == "" + break + else: + # No consensus + break + line = handle.readline() + if not line: + break # end of file + + assert line.strip() == "" + assert seq_cols is not None + + # Confirm all same length + for s in seqs: + assert len(s) == len(seqs[0]) + if consensus: + assert len(consensus) == len(seqs[0]) + + # Loop over any remaining blocks... + done = False + while not done: + # There should be a blank line between each block. + # Also want to ignore any consensus line from the + # previous block. + while (not line) or line.strip() == "": + line = handle.readline() + if not line: + break # end of file + if not line: + break # end of file + + if line.split(None, 1)[0] in known_headers: + # Found concatenated alignment. + self._header = line + break + + for i in range(len(ids)): + if line[0] == " ": + raise ValueError("Unexpected line:\n%r" % line) + fields = line.rstrip().split() + + # We expect there to be two fields, there can be an optional + # "sequence number" field containing the letter count. 
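+                # (e.g. an illustrative line "seqA  ACTG--CA  6", where the
+                # trailing 6 counts the non-gap letters.)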
+ if len(fields) < 2 or len(fields) > 3: + raise ValueError("Could not parse line:\n%r" % line) + + if fields[0] != ids[i]: + raise ValueError( + "Identifiers out of order? Got '%s' but expected '%s'" + % (fields[0], ids[i]) + ) + + if fields[1] != line[seq_cols]: + start = len(fields[0]) + line[len(fields[0]) :].find(fields[1]) + if start != seq_cols.start: + raise ValueError("Old location %s -> %i:XX" % (seq_cols, start)) + end = start + len(fields[1]) + seq_cols = slice(start, end) + del start, end + + # Append the sequence + seqs[i] += fields[1] + assert len(seqs[i]) == len(seqs[0]) + + if len(fields) == 3: + # This MAY be an old style file with a letter count... + try: + letters = int(fields[2]) + except ValueError: + raise ValueError( + "Could not parse line, bad sequence number:\n%s" % line + ) from None + if len(seqs[i].replace("-", "")) != letters: + raise ValueError( + "Could not parse line, invalid sequence number:\n%s" % line + ) + + # Read in the next line + line = handle.readline() + # There should now be a consensus line + if consensus: + assert line[0] == " " + assert seq_cols is not None + consensus += line[seq_cols] + assert len(consensus) == len(seqs[0]) + assert not line[: seq_cols.start].strip() + assert not line[seq_cols.stop :].strip() + # Read in the next line + line = handle.readline() + + assert len(ids) == len(seqs) + if len(seqs) == 0 or len(seqs[0]) == 0: + raise StopIteration + + if ( + self.records_per_alignment is not None + and self.records_per_alignment != len(ids) + ): + raise ValueError( + "Found %i records in this alignment, told to expect %i" + % (len(ids), self.records_per_alignment) + ) + + records = (SeqRecord(Seq(s), id=i, description=i) for (i, s) in zip(ids, seqs)) + alignment = MultipleSeqAlignment(records) + # TODO - Handle alignment annotation better, for now + # mimic the old parser in Bio.Clustalw + if version: + alignment._version = version + if consensus: + alignment_length = len(seqs[0]) + if len(consensus) != alignment_length: + raise ValueError( + "Alignment length is %i, consensus length is %i, '%s'" + % (alignment_length, len(consensus), consensus) + ) + alignment.column_annotations["clustal_consensus"] = consensus + # For backward compatibility prior to .column_annotations: + alignment._star_info = consensus + return alignment diff --git a/code/lib/Bio/AlignIO/EmbossIO.py b/code/lib/Bio/AlignIO/EmbossIO.py new file mode 100644 index 0000000..b1ebd4d --- /dev/null +++ b/code/lib/Bio/AlignIO/EmbossIO.py @@ -0,0 +1,219 @@ +# Copyright 2008-2016 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.AlignIO support for "emboss" alignment output from EMBOSS tools. + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). + +This module contains a parser for the EMBOSS pairs/simple file format, for +example from the alignret, water and needle tools. +""" +from Bio.Align import MultipleSeqAlignment +from Bio.AlignIO.Interfaces import AlignmentIterator +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +class EmbossIterator(AlignmentIterator): + """Emboss alignment iterator. + + For reading the (pairwise) alignments from EMBOSS tools in what they + call the "pairs" and "simple" formats. 
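+
+    A minimal usage sketch (the filename here is illustrative)::
+
+        from Bio import AlignIO
+
+        for alignment in AlignIO.parse("needle.txt", "emboss"):
+            print(alignment.annotations.get("score"))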
+ """ + + _header = None # for caching lines between __next__ calls + + def __next__(self): + """Parse the next alignment from the handle.""" + handle = self.handle + + if self._header is None: + line = handle.readline() + else: + # Header we saved from when we were parsing + # the previous alignment. + line = self._header + self._header = None + + if not line: + raise StopIteration + + while line.rstrip() != "#=======================================": + line = handle.readline() + if not line: + raise StopIteration + + length_of_seqs = None + number_of_seqs = None + ids = [] + header_dict = {} + + while line[0] == "#": + # Read in the rest of this alignment header, + # try and discover the number of records expected + # and their length + parts = line[1:].split(":", 1) + key = parts[0].lower().strip() + if key == "aligned_sequences": + number_of_seqs = int(parts[1].strip()) + assert len(ids) == 0 + # Should now expect the record identifiers... + for i in range(number_of_seqs): + line = handle.readline() + parts = line[1:].strip().split(":", 1) + assert i + 1 == int(parts[0].strip()) + ids.append(parts[1].strip()) + assert len(ids) == number_of_seqs + if key == "length": + length_of_seqs = int(parts[1].strip()) + + # Parse the rest of the header + if key == "identity": + header_dict["identity"] = int(parts[1].strip().split("/")[0]) + if key == "similarity": + header_dict["similarity"] = int(parts[1].strip().split("/")[0]) + if key == "gaps": + header_dict["gaps"] = int(parts[1].strip().split("/")[0]) + if key == "score": + header_dict["score"] = float(parts[1].strip()) + + # And read in another line... + line = handle.readline() + + if number_of_seqs is None: + raise ValueError("Number of sequences missing!") + if length_of_seqs is None: + raise ValueError("Length of sequences missing!") + + if ( + self.records_per_alignment is not None + and self.records_per_alignment != number_of_seqs + ): + raise ValueError( + "Found %i records in this alignment, told to expect %i" + % (number_of_seqs, self.records_per_alignment) + ) + + seqs = [""] * len(ids) + seq_starts = [] + index = 0 + + # Parse the seqs + while line: + if len(line) > 21: + id_start = line[:21].strip().split(None, 1) + seq_end = line[21:].strip().split(None, 1) + if len(id_start) == 2 and len(seq_end) == 2: + # identifier, seq start position, seq, seq end position + # (an aligned seq is broken up into multiple lines) + id, start = id_start + seq, end = seq_end + if start >= end: + # Special case, either a single letter is present, + # or no letters at all. + if seq.replace("-", "") == "": + start = int(start) + end = int(end) + else: + start = int(start) - 1 + end = int(end) + else: + assert seq.replace("-", "") != "", repr(line) + start = int(start) - 1 # python counting + end = int(end) + + if index < 0 or index >= number_of_seqs: + raise ValueError( + "Expected index %i in range [0,%i)" + % (index, number_of_seqs) + ) + # The identifier is truncated... + assert id == ids[index] or id == ids[index][: len(id)] + + if len(seq_starts) == index: + # Record the start + seq_starts.append(start) + + # Check the start... + if start >= end: + assert seq.replace("-", "") == "", line + elif start - seq_starts[index] != len(seqs[index].replace("-", "")): + raise ValueError( + "Found %i chars so far for sequence %i (%s, %r), line says start %i:\n%s" + % ( + len(seqs[index].replace("-", "")), + index, + id, + seqs[index], + start, + line, + ) + ) + seqs[index] += seq + + # Check the end ... 
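+                    # (end must equal the recorded start plus every non-gap
+                    # letter accumulated for this sequence so far.)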
+ if end != seq_starts[index] + len(seqs[index].replace("-", "")): + raise ValueError( + "Found %i chars so far for sequence %i (%s, %r, start=%i), file says end %i:\n%s" + % ( + len(seqs[index].replace("-", "")), + index, + id, + seqs[index], + seq_starts[index], + end, + line, + ) + ) + + index += 1 + if index >= number_of_seqs: + index = 0 + else: + # just a start value, this is just alignment annotation (?) + # print("Skipping: " + line.rstrip()) + pass + elif line.strip() == "": + # Just a spacer? + pass + else: + raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line) + + line = handle.readline() + if ( + line.rstrip() == "#---------------------------------------" + or line.rstrip() == "#=======================================" + ): + # End of alignment + self._header = line + break + + assert index == 0 + + if ( + self.records_per_alignment is not None + and self.records_per_alignment != len(ids) + ): + raise ValueError( + "Found %i records in this alignment, told to expect %i" + % (len(ids), self.records_per_alignment) + ) + + records = [] + for id, seq in zip(ids, seqs): + if len(seq) != length_of_seqs: + # EMBOSS 2.9.0 is known to use spaces instead of minus signs + # for leading gaps, and thus fails to parse. This old version + # is still used as of Dec 2008 behind the EBI SOAP webservice: + # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl + raise ValueError( + "Error parsing alignment - sequences of " + "different length? You could be using an " + "old version of EMBOSS." + ) + records.append(SeqRecord(Seq(seq), id=id, description=id)) + return MultipleSeqAlignment(records, annotations=header_dict) diff --git a/code/lib/Bio/AlignIO/FastaIO.py b/code/lib/Bio/AlignIO/FastaIO.py new file mode 100644 index 0000000..9816253 --- /dev/null +++ b/code/lib/Bio/AlignIO/FastaIO.py @@ -0,0 +1,344 @@ +# Copyright 2008-2016 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). + +This module contains a parser for the pairwise alignments produced by Bill +Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is +referred to as the "fasta-m10" file format (as we only support the machine +readable output format selected with the -m 10 command line option). + +This module does NOT cover the generic "fasta" file format originally +developed as an input format to the FASTA tools. The Bio.AlignIO and +Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, +which can also be used to store a multiple sequence alignments. +""" +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +def _extract_alignment_region(alignment_seq_with_flanking, annotation): + """Extract alignment region (PRIVATE). + + Helper function for the main parsing code. + + To get the actual pairwise alignment sequences, we must first + translate the un-gapped sequence based coordinates into positions + in the gapped sequence (which may have a flanking region shown + using leading - characters). 
To date, I have never seen any + trailing flanking region shown in the m10 file, but the + following code should also cope with that. + + Note that this code seems to work fine even when the "sq_offset" + entries are present as a result of using the -X command line option. + """ + align_stripped = alignment_seq_with_flanking.strip("-") + display_start = int(annotation["al_display_start"]) + if int(annotation["al_start"]) <= int(annotation["al_stop"]): + start = int(annotation["al_start"]) - display_start + end = int(annotation["al_stop"]) - display_start + 1 + else: + # FASTA has flipped this sequence... + start = display_start - int(annotation["al_start"]) + end = display_start - int(annotation["al_stop"]) + 1 + + end += align_stripped.count("-") + if start < 0 or start >= end or end > len(align_stripped): + raise ValueError( + "Problem with sequence start/stop,\n%s[%i:%i]\n%s" + % (alignment_seq_with_flanking, start, end, annotation) + ) + return align_stripped[start:end] + + +def FastaM10Iterator(handle, seq_count=None): + """Alignment iterator for the FASTA tool's pairwise alignment output. + + This is for reading the pairwise alignments output by Bill Pearson's + FASTA program when called with the -m 10 command line option for machine + readable output. For more details about the FASTA tools, see the website + http://fasta.bioch.virginia.edu/ and the paper: + + W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 + + This class is intended to be used via the Bio.AlignIO.parse() function + by specifying the format as "fasta-m10" as shown in the following code:: + + from Bio import AlignIO + handle = ... + for a in AlignIO.parse(handle, "fasta-m10"): + assert len(a) == 2, "Should be pairwise!" + print("Alignment length %i" % a.get_alignment_length()) + for record in a: + print("%s %s %s" % (record.seq, record.name, record.id)) + + Note that this is not a full blown parser for all the information + in the FASTA output - for example, most of the header and all of the + footer is ignored. Also, the alignments are not batched according to + the input queries. + + Also note that there can be up to about 30 letters of flanking region + included in the raw FASTA output as contextual information. This is NOT + part of the alignment itself, and is not included in the resulting + MultipleSeqAlignment objects returned. + """ + state_PREAMBLE = -1 + state_NONE = 0 + state_QUERY_HEADER = 1 + state_ALIGN_HEADER = 2 + state_ALIGN_QUERY = 3 + state_ALIGN_MATCH = 4 + state_ALIGN_CONS = 5 + + def build_hsp(): + if not query_tags and not match_tags: + raise ValueError("No data for query %r, match %r" % (query_id, match_id)) + assert query_tags, query_tags + assert match_tags, match_tags + evalue = align_tags.get("fa_expect") + tool = global_tags.get("tool", "").upper() + + q = _extract_alignment_region(query_seq, query_tags) + if tool in ["TFASTX"] and len(match_seq) == len(q): + m = match_seq + # Quick hack until I can work out how -, * and / characters + # and the apparent mix of aa and bp coordinates works. + else: + m = _extract_alignment_region(match_seq, match_tags) + if len(q) != len(m): + raise ValueError( + f"""\ +Darn... amino acids vs nucleotide coordinates? +tool: {tool} +query_seq: {query_seq} +query_tags: {query_tags} +{q} length: {len(q)} +match_seq: {match_seq} +match_tags: {match_tags} +{m} length: {len(m)} +handle.name: {handle.name} +""" + ) + + annotations = {} + records = [] + + # Want to record both the query header tags, and the alignment tags. 
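+        # (align_tags is merged second, so per-HSP values overwrite any
+        # query-header values sharing the same key.)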
+ annotations.update(header_tags) + annotations.update(align_tags) + + # Query + # ===== + record = SeqRecord( + Seq(q), + id=query_id, + name="query", + description=query_descr, + annotations={"original_length": int(query_tags["sq_len"])}, + ) + # TODO - handle start/end coordinates properly. Short term hack for now: + record._al_start = int(query_tags["al_start"]) + record._al_stop = int(query_tags["al_stop"]) + + # TODO - Can FASTA output RNA? + if "sq_type" in query_tags: + if query_tags["sq_type"] == "D": + record.annotations["molecule_type"] = "DNA" + elif query_tags["sq_type"] == "p": + record.annotations["molecule_type"] = "protein" + + records.append(record) + + # Match + # ===== + record = SeqRecord( + Seq(m), + id=match_id, + name="match", + description=match_descr, + annotations={"original_length": int(match_tags["sq_len"])}, + ) + # TODO - handle start/end coordinates properly. Short term hack for now: + record._al_start = int(match_tags["al_start"]) + record._al_stop = int(match_tags["al_stop"]) + + if "sq_type" in match_tags: + if match_tags["sq_type"] == "D": + record.annotations["molecule_type"] = "DNA" + elif match_tags["sq_type"] == "p": + record.annotations["molecule_type"] = "protein" + + records.append(record) + + return MultipleSeqAlignment(records, annotations=annotations) + + state = state_PREAMBLE + query_id = None + match_id = None + query_descr = "" + match_descr = "" + global_tags = {} + header_tags = {} + align_tags = {} + query_tags = {} + match_tags = {} + query_seq = "" + match_seq = "" + cons_seq = "" + for line in handle: + if ">>>" in line and not line.startswith(">>>"): + if query_id and match_id: + # This happens on old FASTA output which lacked an end of + # query >>><<< marker line. + yield build_hsp() + state = state_NONE + query_descr = line[line.find(">>>") + 3 :].strip() + query_id = query_descr.split(None, 1)[0] + match_id = None + header_tags = {} + align_tags = {} + query_tags = {} + match_tags = {} + query_seq = "" + match_seq = "" + cons_seq = "" + elif line.startswith("!! No "): + # e.g. + # !! No library sequences with E() < 0.5 + # or on more recent versions, + # No sequences with E() < 0.05 + assert state == state_NONE + assert not header_tags + assert not align_tags + assert not match_tags + assert not query_tags + assert match_id is None + assert not query_seq + assert not match_seq + assert not cons_seq + query_id = None + elif line.strip() in [">>><<<", ">>>///"]: + # End of query, possible end of all queries + if query_id and match_id: + yield build_hsp() + state = state_NONE + query_id = None + match_id = None + header_tags = {} + align_tags = {} + query_tags = {} + match_tags = {} + query_seq = "" + match_seq = "" + cons_seq = "" + elif line.startswith(">>>"): + # Should be start of a match! + assert query_id is not None + assert line[3:].split(", ", 1)[0] == query_id, line + assert match_id is None + assert not header_tags + assert not align_tags + assert not query_tags + assert not match_tags + assert not match_seq + assert not query_seq + assert not cons_seq + state = state_QUERY_HEADER + elif line.startswith(">>"): + # Should now be at start of a match alignment! 
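+            # (A new ">>" header implies the previous HSP, if any, is
+            # complete, so emit it before resetting the per-match state.)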
+ if query_id and match_id: + yield build_hsp() + align_tags = {} + query_tags = {} + match_tags = {} + query_seq = "" + match_seq = "" + cons_seq = "" + match_descr = line[2:].strip() + match_id = match_descr.split(None, 1)[0] + state = state_ALIGN_HEADER + elif line.startswith(">--"): + # End of one HSP + assert query_id and match_id, line + yield build_hsp() + # Clean up read for next HSP + # but reuse header_tags + align_tags = {} + query_tags = {} + match_tags = {} + query_seq = "" + match_seq = "" + cons_seq = "" + state = state_ALIGN_HEADER + elif line.startswith(">"): + if state == state_ALIGN_HEADER: + # Should be start of query alignment seq... + assert query_id is not None, line + assert match_id is not None, line + assert query_id.startswith(line[1:].split(None, 1)[0]), line + state = state_ALIGN_QUERY + elif state == state_ALIGN_QUERY: + # Should be start of match alignment seq + assert query_id is not None, line + assert match_id is not None, line + assert match_id.startswith(line[1:].split(None, 1)[0]), line + state = state_ALIGN_MATCH + elif state == state_NONE: + # Can get > as the last line of a histogram + pass + else: + raise RuntimeError("state %i got %r" % (state, line)) + elif line.startswith("; al_cons"): + assert state == state_ALIGN_MATCH, line + state = state_ALIGN_CONS + # Next line(s) should be consensus seq... + elif line.startswith("; "): + if ": " in line: + key, value = [s.strip() for s in line[2:].split(": ", 1)] + else: + import warnings + from Bio import BiopythonParserWarning + + # Seen in lalign36, specifically version 36.3.4 Apr, 2011 + # Fixed in version 36.3.5b Oct, 2011(preload8) + warnings.warn( + "Missing colon in line: %r" % line, BiopythonParserWarning + ) + try: + key, value = [s.strip() for s in line[2:].split(" ", 1)] + except ValueError: + raise ValueError("Bad line: %r" % line) from None + if state == state_QUERY_HEADER: + header_tags[key] = value + elif state == state_ALIGN_HEADER: + align_tags[key] = value + elif state == state_ALIGN_QUERY: + query_tags[key] = value + elif state == state_ALIGN_MATCH: + match_tags[key] = value + else: + raise RuntimeError("Unexpected state %r, %r" % (state, line)) + elif state == state_ALIGN_QUERY: + query_seq += line.strip() + elif state == state_ALIGN_MATCH: + match_seq += line.strip() + elif state == state_ALIGN_CONS: + cons_seq += line.strip("\n") + elif state == state_PREAMBLE: + if line.startswith("#"): + global_tags["command"] = line[1:].strip() + elif line.startswith(" version "): + global_tags["version"] = line[9:].strip() + elif " compares a " in line: + global_tags["tool"] = line[: line.find(" compares a ")].strip() + elif " searches a " in line: + global_tags["tool"] = line[: line.find(" searches a ")].strip() + else: + pass diff --git a/code/lib/Bio/AlignIO/Interfaces.py b/code/lib/Bio/AlignIO/Interfaces.py new file mode 100644 index 0000000..b53de30 --- /dev/null +++ b/code/lib/Bio/AlignIO/Interfaces.py @@ -0,0 +1,160 @@ +# Copyright 2008-2018 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""AlignIO support module (not for general use). + +Unless you are writing a new parser or writer for Bio.AlignIO, you should not +use this module. It provides base classes to try and simplify things. 
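+
+A minimal subclass sketch (illustrative only, not a real Bio.AlignIO
+format)::
+
+    from Bio.AlignIO.Interfaces import AlignmentIterator
+
+    class OneBlockIterator(AlignmentIterator):
+        def __next__(self):
+            line = self.handle.readline()
+            if not line:
+                raise StopIteration
+            # ...parse one block into a MultipleSeqAlignment and return it...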
+""" + + +class AlignmentIterator: + """Base class for building MultipleSeqAlignment iterators. + + You should write a next() method to return Alignment + objects. You may wish to redefine the __init__ + method as well. + """ + + def __init__(self, handle, seq_count=None): + """Create an AlignmentIterator object. + + Arguments: + - handle - input file + - count - optional, expected number of records per alignment + Recommend for fasta file format. + + Note when subclassing: + - there should be a single non-optional argument, the handle, + and optional count IN THAT ORDER. + - you can add additional optional arguments. + + """ + self.handle = handle + self.records_per_alignment = seq_count + ##################################################### + # You may want to subclass this, for example # + # to read through the file to find the first record,# + # or if additional arguments are required. # + ##################################################### + + def __next__(self): + """Return the next alignment in the file. + + This method should be replaced by any derived class to do something + useful. + """ + raise NotImplementedError("This object should be subclassed") + ##################################################### + # You SHOULD subclass this, to split the file up # + # into your individual alignments and convert these # + # into MultipleSeqAlignment objects. # + ##################################################### + + def __iter__(self): + """Iterate over the entries as MultipleSeqAlignment objects. + + Example usage for (concatenated) PHYLIP files:: + + with open("many.phy","r") as myFile: + for alignment in PhylipIterator(myFile): + print("New alignment:") + for record in alignment: + print(record.id) + print(record.seq) + + """ + return iter(self.__next__, None) + + +class AlignmentWriter: + """Base class for building MultipleSeqAlignment writers. + + You should write a write_alignment() method. + You may wish to redefine the __init__ method as well. + """ + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + + def write_file(self, alignments): + """Use this to write an entire file containing the given alignments. + + Arguments: + - alignments - A list or iterator returning MultipleSeqAlignment objects + + In general, this method can only be called once per file. + + This method should be replaced by any derived class to do something + useful. It should return the number of alignments.. + """ + raise NotImplementedError("This object should be subclassed") + ##################################################### + # You SHOULD subclass this, to write the alignment # + # objects to the file handle # + ##################################################### + + def clean(self, text): + """Use this to avoid getting newlines in the output.""" + return text.replace("\n", " ").replace("\r", " ") + + +class SequentialAlignmentWriter(AlignmentWriter): + """Base class for building MultipleSeqAlignment writers. + + This assumes each alignment can be simply appended to the file. + You should write a write_alignment() method. + You may wish to redefine the __init__ method as well. + """ + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + + def write_file(self, alignments): + """Use this to write an entire file containing the given alignments. + + Arguments: + - alignments - A list or iterator returning MultipleSeqAlignment objects + + In general, this method can only be called once per file. 
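+
+        Returns the number of alignments written.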
+        """
+        self.write_header()
+        count = 0
+        for alignment in alignments:
+            self.write_alignment(alignment)
+            count += 1
+        self.write_footer()
+        return count
+
+    def write_header(self):
+        """Use this to write any header.
+
+        This method should be replaced by any derived class to do something
+        useful.
+        """
+        pass
+
+    def write_footer(self):
+        """Use this to write any footer.
+
+        This method should be replaced by any derived class to do something
+        useful.
+        """
+        pass
+
+    def write_alignment(self, alignment):
+        """Use this to write a single alignment.
+
+        This method should be replaced by any derived class to do something
+        useful.
+        """
+        raise NotImplementedError("This object should be subclassed")
+        #####################################################
+        # You SHOULD subclass this, to write the alignment  #
+        # objects to the file handle                        #
+        #####################################################
diff --git a/code/lib/Bio/AlignIO/MafIO.py b/code/lib/Bio/AlignIO/MafIO.py
new file mode 100644
index 0000000..787325e
--- /dev/null
+++ b/code/lib/Bio/AlignIO/MafIO.py
@@ -0,0 +1,833 @@
+# Copyright 2011, 2012 by Andrew Sczesnak. All rights reserved.
+# Revisions Copyright 2011, 2017 by Peter Cock. All rights reserved.
+# Revisions Copyright 2014, 2015 by Adam Novak. All rights reserved.
+# Revisions Copyright 2015, 2017 by Blaise Li. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for the "maf" multiple alignment format.
+
+The Multiple Alignment Format, described by UCSC, stores a series of
+multiple alignments in a single file. It is suitable for whole-genome
+to whole-genome alignments; metadata such as source chromosome, start
+position, size, and strand can be stored.
+
+See http://genome.ucsc.edu/FAQ/FAQformat.html#format5
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+Coordinates in the MAF format are defined in terms of zero-based start
+positions (like Python) and aligning region sizes.
+
+A minimal aligned region of length one and starting at the first position in
+the source sequence would have ``start == 0`` and ``size == 1``.
+
+As we can see from this example, ``start + size`` will give one more than the
+zero-based end position. We can therefore manipulate ``start`` and
+``start + size`` as Python list slice boundaries.
+
+For an inclusive end coordinate, we need to use ``end = start + size - 1``.
+A 1-column wide alignment would have ``start == end``.
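+
+Worked example of the arithmetic above: a region with ``start == 3`` and
+``size == 4`` covers zero-based positions 3, 4, 5 and 6, i.e. the Python
+slice ``seq[3:3 + 4]``, and its inclusive end is ``3 + 4 - 1 == 6``.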
+""" +import os + +from itertools import islice +from sqlite3 import dbapi2 + +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from .Interfaces import SequentialAlignmentWriter + +MAFINDEX_VERSION = 2 + + +class MafWriter(SequentialAlignmentWriter): + """Accepts a MultipleSeqAlignment object, writes a MAF file.""" + + def write_header(self): + """Write the MAF header.""" + self.handle.write("##maf version=1 scoring=none\n") + self.handle.write("# generated by Biopython\n\n") + + def _write_record(self, record): + """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE).""" + # convert biopython-style 1/-1 strand to MAF-style +/- strand + if record.annotations.get("strand") == 1: + strand = "+" + elif record.annotations.get("strand") == -1: + strand = "-" + else: + # TODO: issue warning? + strand = "+" + + fields = [ + "s", + # In the MAF file format, spaces are not allowed in the id + "%-40s" % record.id.replace(" ", "_"), + "%15s" % record.annotations.get("start", 0), + "%5s" + % record.annotations.get("size", len(str(record.seq).replace("-", ""))), + strand, + "%15s" % record.annotations.get("srcSize", 0), + str(record.seq), + ] + self.handle.write("%s\n" % " ".join(fields)) + + def write_alignment(self, alignment): + """Write a complete alignment to a MAF block. + + Writes every SeqRecord in a MultipleSeqAlignment object to its own + MAF block (beginning with an 'a' line, containing 's' lines). + """ + if not isinstance(alignment, MultipleSeqAlignment): + raise TypeError("Expected an alignment object") + + if len({len(x) for x in alignment}) > 1: + raise ValueError("Sequences must all be the same length") + + # We allow multiple sequences with the same IDs; for example, there may + # be a MAF aligning the + and - strands of the same sequence together. + + # for now, use ._annotations private property, but restrict keys to those + # specifically supported by the MAF format, according to spec + try: + anno = " ".join( + [ + "%s=%s" % (x, y) + for x, y in alignment._annotations.items() + if x in ("score", "pass") + ] + ) + except AttributeError: + anno = "score=0.00" + + self.handle.write("a %s\n" % (anno,)) + + recs_out = 0 + + for record in alignment: + self._write_record(record) + + recs_out += 1 + + self.handle.write("\n") + + return recs_out + + +# Invalid function name according to pylint, but kept for compatibility +# with Bio* conventions. +def MafIterator(handle, seq_count=None): + """Iterate over a MAF file handle as MultipleSeqAlignment objects. + + Iterates over lines in a MAF file-like object (handle), yielding + MultipleSeqAlignment objects. SeqRecord IDs generally correspond to + species names. + """ + in_a_bundle = False + + annotations = [] + records = [] + + while True: + # allows parsing of the last bundle without duplicating code + try: + line = next(handle) + except StopIteration: + line = "" + + if in_a_bundle: + if line.startswith("s"): + # add a SeqRecord to the bundle + line_split = line.strip().split() + + if len(line_split) != 7: + raise ValueError( + "Error parsing alignment - 's' line must have 7 fields" + ) + + # convert MAF-style +/- strand to biopython-type 1/-1 + if line_split[4] == "+": + strand = 1 + elif line_split[4] == "-": + strand = -1 + else: + # TODO: issue warning, set to 0? 
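+                    # (MAF defines only "+" and "-", so any other value is
+                    # coerced to the forward strand here.)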
+ strand = 1 + + # s (literal), src (ID), start, size, strand, srcSize, text (sequence) + anno = { + "start": int(line_split[2]), + "size": int(line_split[3]), + "strand": strand, + "srcSize": int(line_split[5]), + } + + sequence = line_split[6] + + # interpret a dot/period to mean the same as the first sequence + if "." in sequence: + if not records: + raise ValueError( + "Found dot/period in first sequence of alignment" + ) + + ref = records[0].seq + new = [] + + for (letter, ref_letter) in zip(sequence, ref): + new.append(ref_letter if letter == "." else letter) + + sequence = "".join(new) + + records.append( + SeqRecord( + Seq(sequence), + id=line_split[1], + name=line_split[1], + description="", + annotations=anno, + ) + ) + elif line.startswith("i"): + # TODO: information about what is in the aligned species DNA before + # and after the immediately preceding "s" line + pass + elif line.startswith("e"): + # TODO: information about the size of the gap between the alignments + # that span the current block + pass + elif line.startswith("q"): + # TODO: quality of each aligned base for the species. + # Need to find documentation on this, looks like ASCII 0-9 or gap? + # Can then store in each SeqRecord's .letter_annotations dictionary, + # perhaps as the raw string or turned into integers / None for gap? + pass + elif line.startswith("#"): + # ignore comments + # (not sure whether comments + # are in the maf specification, though) + pass + elif not line.strip(): + # end a bundle of records + if seq_count is not None: + assert len(records) == seq_count + + alignment = MultipleSeqAlignment(records) + # TODO - Introduce an annotated alignment class? + # See also Bio/AlignIO/FastaIO.py for same requirement. + # For now, store the annotation a new private property: + alignment._annotations = annotations + + yield alignment + + in_a_bundle = False + + annotations = [] + records = [] + else: + raise ValueError( + "Error parsing alignment - unexpected line:\n%s" % (line,) + ) + elif line.startswith("a"): + # start a bundle of records + in_a_bundle = True + annot_strings = line.strip().split()[1:] + if len(annot_strings) != line.count("="): + raise ValueError("Error parsing alignment - invalid key in 'a' line") + annotations = dict(a_string.split("=") for a_string in annot_strings) + elif line.startswith("#"): + # ignore comments + pass + elif not line: + break + + +class MafIndex: + """Index for a MAF file. + + The index is a sqlite3 database that is built upon creation of the object + if necessary, and queried when methods *search* or *get_spliced* are + used. 
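+
+    A minimal usage sketch (the file names and target sequence name here are
+    hypothetical)::
+
+        from Bio.AlignIO.MafIO import MafIndex
+
+        idx = MafIndex("chr10.mafindex", "chr10.maf", "mm9.chr10")
+        for alignment in idx.search([3014742], [3014778]):
+            print(alignment)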
+ """ + + def __init__(self, sqlite_file, maf_file, target_seqname): + """Indexes or loads the index of a MAF file.""" + self._target_seqname = target_seqname + # example: Tests/MAF/ucsc_mm9_chr10.mafindex + self._index_filename = sqlite_file + # example: /home/bli/src/biopython/Tests/MAF + self._relative_path = os.path.abspath(os.path.dirname(sqlite_file)) + # example: Tests/MAF/ucsc_mm9_chr10.maf + self._maf_file = maf_file + + self._maf_fp = open(self._maf_file) + + # if sqlite_file exists, use the existing db, otherwise index the file + if os.path.isfile(sqlite_file): + self._con = dbapi2.connect(sqlite_file) + self._record_count = self.__check_existing_db() + else: + self._con = dbapi2.connect(sqlite_file) + self._record_count = self.__make_new_index() + + # lastly, setup a MafIterator pointing at the open maf_file + self._mafiter = MafIterator(self._maf_fp) + + def __check_existing_db(self): + """Perform basic sanity checks upon loading an existing index (PRIVATE).""" + try: + idx_version = int( + self._con.execute( + "SELECT value FROM meta_data WHERE key = 'version'" + ).fetchone()[0] + ) + if idx_version != MAFINDEX_VERSION: + msg = "\n".join( + [ + "Index version (%s) incompatible with this version " + "of MafIndex" % idx_version, + "You might erase the existing index %s " + "for it to be rebuilt." % self._index_filename, + ] + ) + raise ValueError(msg) + + filename = self._con.execute( + "SELECT value FROM meta_data WHERE key = 'filename'" + ).fetchone()[0] + # Compute absolute path of the original maf file + if os.path.isabs(filename): + # It was already stored as absolute + tmp_mafpath = filename + else: + # It should otherwise have been stored as relative to the index + # Would be stored with Unix / path separator, so convert + # it to the local OS path separator here: + tmp_mafpath = os.path.join( + self._relative_path, filename.replace("/", os.path.sep) + ) + if tmp_mafpath != os.path.abspath(self._maf_file): + # Original and given absolute paths differ. + raise ValueError( + "Index uses a different file (%s != %s)" + % (filename, self._maf_file) + ) + + db_target = self._con.execute( + "SELECT value FROM meta_data WHERE key = 'target_seqname'" + ).fetchone()[0] + if db_target != self._target_seqname: + raise ValueError( + "Provided database indexed for %s, expected %s" + % (db_target, self._target_seqname) + ) + + record_count = int( + self._con.execute( + "SELECT value FROM meta_data WHERE key = 'record_count'" + ).fetchone()[0] + ) + if record_count == -1: + raise ValueError("Unfinished/partial database provided") + + records_found = int( + self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0] + ) + if records_found != record_count: + raise ValueError( + "Expected %s records, found %s. Corrupt index?" 
+ % (record_count, records_found) + ) + + return records_found + + except (dbapi2.OperationalError, dbapi2.DatabaseError) as err: + raise ValueError("Problem with SQLite database: %s" % err) from None + + def __make_new_index(self): + """Read MAF file and generate SQLite index (PRIVATE).""" + # make the tables + self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") + self._con.execute( + "INSERT INTO meta_data (key, value) VALUES ('version', %s);" + % MAFINDEX_VERSION + ) + self._con.execute( + "INSERT INTO meta_data (key, value) VALUES ('record_count', -1);" + ) + self._con.execute( + "INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" + % (self._target_seqname,) + ) + # Determine whether to store maf file as relative to the index or absolute + # See https://github.com/biopython/biopython/pull/381 + if not os.path.isabs(self._maf_file) and not os.path.isabs( + self._index_filename + ): + # Since the user gave both maf file and index as relative paths, + # we will store the maf file relative to the index. + # Note for cross platform use (e.g. shared drive over SAMBA), + # convert any Windows slash into Unix style for rel paths. + # example: ucsc_mm9_chr10.maf + mafpath = os.path.relpath(self._maf_file, self._relative_path).replace( + os.path.sep, "/" + ) + elif ( + os.path.dirname(os.path.abspath(self._maf_file)) + os.path.sep + ).startswith(self._relative_path + os.path.sep): + # Since maf file is in same directory or sub directory, + # might as well make this into a relative path: + mafpath = os.path.relpath(self._maf_file, self._relative_path).replace( + os.path.sep, "/" + ) + else: + # Default to storing as an absolute path + # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf + mafpath = os.path.abspath(self._maf_file) + self._con.execute( + "INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (mafpath,) + ) + self._con.execute( + "CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);" + ) + + insert_count = 0 + + # iterate over the entire file and insert in batches + mafindex_func = self.__maf_indexer() + + while True: + batch = list(islice(mafindex_func, 100)) + if not batch: + break + + # batch is made from self.__maf_indexer(), + # which yields zero-based "inclusive" start and end coordinates + self._con.executemany( + "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", + batch, + ) + self._con.commit() + insert_count += len(batch) + + # then make indexes on the relevant fields + self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);") + self._con.execute( + "CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);" + ) + self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);") + + self._con.execute( + "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" + % (insert_count,) + ) + + self._con.commit() + + return insert_count + + def __maf_indexer(self): + """Return index information for each bundle (PRIVATE). + + Yields index information for each bundle in the form of + (bin, start, end, offset) tuples where start and end are + 0-based inclusive coordinates. 
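+
+        For example (hypothetical values), a bundle whose target 's' line
+        reads ``s mm9.chr10 3014742 36 + ...`` would yield
+        ``(608, 3014742, 3014777, offset)``: the inclusive end is
+        ``3014742 + 36 - 1``, and 608 is the smallest UCSC bin holding the
+        region, i.e. ``585 + (3014742 >> 17)``.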
+ """ + line = self._maf_fp.readline() + + while line: + if line.startswith("a"): + # note the offset + offset = self._maf_fp.tell() - len(line) + + # search the following lines for a match to target_seqname + while True: + line = self._maf_fp.readline() + + if not line.strip() or line.startswith("a"): + # Empty line or new alignment record + raise ValueError( + "Target for indexing (%s) not found in this bundle" + % (self._target_seqname,) + ) + elif line.startswith("s"): + # s (literal), src (ID), start, size, strand, srcSize, text (sequence) + line_split = line.strip().split() + + if line_split[1] == self._target_seqname: + start = int(line_split[2]) + size = int(line_split[3]) + if size != len(line_split[6].replace("-", "")): + raise ValueError( + "Invalid length for target coordinates " + "(expected %s, found %s)" + % (size, len(line_split[6].replace("-", ""))) + ) + + # "inclusive" end position is start + length - 1 + end = start + size - 1 + + # _ucscbin takes end-exclusive coordinates + yield (self._ucscbin(start, end + 1), start, end, offset) + + break + + line = self._maf_fp.readline() + + # TODO: check coordinate correctness for the two bin-related static methods + @staticmethod + def _region2bin(start, end): + """Find bins that a region may belong to (PRIVATE). + + Converts a region to a list of bins that it may belong to, including largest + and smallest bins. + """ + bins = [0, 1] + + bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26))) + bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23))) + bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20))) + bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17))) + + return set(bins) + + @staticmethod + def _ucscbin(start, end): + """Return the smallest bin a given region will fit into (PRIVATE). + + Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system + """ + bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] + + _bin_first_shift = 17 + _bin_next_shift = 3 + + start_bin = start + end_bin = end - 1 + + start_bin >>= _bin_first_shift + end_bin >>= _bin_first_shift + + for bin_offset in bin_offsets: + if start_bin == end_bin: + return bin_offset + start_bin + start_bin >>= _bin_next_shift + end_bin >>= _bin_next_shift + + return 0 + + def _get_record(self, offset): + """Retrieve a single MAF record located at the offset provided (PRIVATE).""" + self._maf_fp.seek(offset) + return next(self._mafiter) + + def search(self, starts, ends): + """Search index database for MAF records overlapping ranges provided. + + Returns *MultipleSeqAlignment* results in order by start, then end, then + internal offset field. + + *starts* should be a list of 0-based start coordinates of segments in the reference. + *ends* should be the list of the corresponding segment ends + (in the half-open UCSC convention: + http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). + """ + # verify the provided exon coordinates + if len(starts) != len(ends): + raise ValueError("Every position in starts must have a match in ends") + + # Could it be safer to sort the (exonstart, exonend) pairs? 
+        for exonstart, exonend in zip(starts, ends):
+            exonlen = exonend - exonstart
+            if exonlen < 1:
+                raise ValueError(
+                    "Exon coordinates (%d, %d) invalid: exon length (%d) < 1"
+                    % (exonstart, exonend, exonlen)
+                )
+        con = self._con
+
+        # Keep track of what blocks have already been yielded
+        # in order to avoid duplicating them
+        # (see https://github.com/biopython/biopython/issues/1083)
+        yielded_rec_coords = set()
+        # search for every exon
+        for exonstart, exonend in zip(starts, ends):
+            try:
+                possible_bins = ", ".join(
+                    map(str, self._region2bin(exonstart, exonend))
+                )
+            except TypeError:
+                raise TypeError(
+                    "Exon coordinates must be integers "
+                    "(start=%d, end=%d)" % (exonstart, exonend)
+                ) from None
+
+            # https://www.sqlite.org/lang_expr.html
+            # -----
+            # The BETWEEN operator
+            #
+            # The BETWEEN operator is logically equivalent to a pair of
+            # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z"
+            # except that with BETWEEN, the x expression is only evaluated
+            # once. The precedence of the BETWEEN operator is the same as the
+            # precedence as operators == and != and LIKE and groups left to
+            # right.
+            # -----
+
+            # We are testing overlap between the query segment and records in
+            # the index, using non-strict coordinate comparisons.
+            # The query segment end must be passed as end-inclusive.
+            # The index should also have been built with end-inclusive
+            # end coordinates.
+            # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073
+
+            result = con.execute(
+                "SELECT DISTINCT start, end, offset FROM offset_data "
+                "WHERE bin IN (%s) "
+                "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) "
+                "ORDER BY start, end, offset ASC;"
+                % (possible_bins, exonstart, exonend - 1, exonend - 1)
+            )
+
+            rows = result.fetchall()
+
+            # rows come from the sqlite index,
+            # which should have been written using __make_new_index,
+            # so rec_start and rec_end should be zero-based "inclusive" coordinates
+            for rec_start, rec_end, offset in rows:
+                # Avoid yielding the same block multiple times
+                if (rec_start, rec_end) in yielded_rec_coords:
+                    continue
+                else:
+                    yielded_rec_coords.add((rec_start, rec_end))
+                # Iterate through hits, fetching alignments from the MAF file
+                # and checking to be sure we've retrieved the expected record.
+
+                fetched = self._get_record(int(offset))
+
+                for record in fetched:
+                    if record.id == self._target_seqname:
+                        # start and size come from the maf lines
+                        start = record.annotations["start"]
+                        # "inclusive" end is start + length - 1
+                        end = start + record.annotations["size"] - 1
+
+                        if not (start == rec_start and end == rec_end):
+                            raise ValueError(
+                                "Expected %s-%s @ offset %s, found %s-%s"
+                                % (rec_start, rec_end, offset, start, end)
+                            )
+
+                yield fetched
+
+    def get_spliced(self, starts, ends, strand=1):
+        """Return a multiple alignment of the exact sequence range provided.
+
+        Accepts two lists of start and end positions on target_seqname, representing
+        exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the
+        desired sequences spliced together.
+
+        *starts* should be a list of 0-based start coordinates of segments in the reference.
+        *ends* should be the list of the corresponding segment ends
+        (in the half-open UCSC convention:
+        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
+
+        To ask for the alignment portion corresponding to the first 100
+        nucleotides of the reference sequence, you would use
+        ``get_spliced([0], [100])``
+        """
+        # validate strand
+        if strand not in (1, -1):
+            raise ValueError("Strand must be 1 or -1, got %s" % strand)
+
+        # pull all alignments that span the desired intervals
+        fetched = list(self.search(starts, ends))
+
+        # keep track of the expected letter count
+        # (sum of the lengths of the half-open [start, end) segments)
+        expected_letters = sum(end - start for start, end in zip(starts, ends))
+
+        # if there's no alignment, return a filler alignment of the expected length
+        if len(fetched) == 0:
+            return MultipleSeqAlignment(
+                [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)]
+            )
+
+        # find the union of all IDs in these alignments
+        all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq}
+
+        # split every record by base position
+        # key: sequence name
+        # value: dictionary
+        #        key: position in the reference sequence
+        #        value: letter(s) (including letters
+        #               aligned to the "-" preceding the letter
+        #               at the position in the reference, if any)
+        split_by_position = {seq_name: {} for seq_name in all_seqnames}
+
+        # keep track of what the total number of (unspliced) letters should be
+        total_rec_length = 0
+
+        # track first strand encountered on the target seqname
+        ref_first_strand = None
+
+        for multiseq in fetched:
+            # find the target_seqname in this MultipleSeqAlignment and use it to
+            # set the parameters for the rest of this iteration
+            for seqrec in multiseq:
+                if seqrec.id == self._target_seqname:
+                    try:
+                        if ref_first_strand is None:
+                            ref_first_strand = seqrec.annotations["strand"]
+
+                            if ref_first_strand not in (1, -1):
+                                raise ValueError("Strand must be 1 or -1")
+                        elif ref_first_strand != seqrec.annotations["strand"]:
+                            raise ValueError(
+                                "Encountered strand='%s' on target seqname, "
+                                "expected '%s'"
+                                % (seqrec.annotations["strand"], ref_first_strand)
+                            )
+                    except KeyError:
+                        raise ValueError(
+                            "No strand information for target seqname (%s)"
+                            % self._target_seqname
+                        ) from None
+                    # length including gaps (i.e.
alignment length) + rec_length = len(seqrec) + rec_start = seqrec.annotations["start"] + ungapped_length = seqrec.annotations["size"] + # inclusive end in zero-based coordinates of the reference + rec_end = rec_start + ungapped_length - 1 + # This is length in terms of actual letters in the reference + total_rec_length += ungapped_length + + # blank out these positions for every seqname + for seqrec in multiseq: + for pos in range(rec_start, rec_end + 1): + split_by_position[seqrec.id][pos] = "" + + break + # http://psung.blogspot.fr/2007/12/for-else-in-python.html + # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops + else: + raise ValueError( + "Did not find %s in alignment bundle" % (self._target_seqname,) + ) + + # the true, chromosome/contig/etc position in the target seqname + real_pos = rec_start + + # loop over the alignment to fill split_by_position + for gapped_pos in range(0, rec_length): + for seqrec in multiseq: + # keep track of this position's value for the target seqname + if seqrec.id == self._target_seqname: + track_val = seqrec.seq[gapped_pos] + + # Here, a real_pos that corresponds to just after a series of "-" + # in the reference will "accumulate" the letters found in other sequences + # in front of the "-"s + split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos] + + # increment the real_pos counter only when non-gaps are found in + # the target_seqname, and we haven't reached the end of the record + if track_val != "-" and real_pos < rec_end: + real_pos += 1 + + # make sure the number of bp entries equals the sum of the record lengths + if len(split_by_position[self._target_seqname]) != total_rec_length: + raise ValueError( + "Target seqname (%s) has %s records, expected %s" + % ( + self._target_seqname, + len(split_by_position[self._target_seqname]), + total_rec_length, + ) + ) + + # translates a position in the target_seqname sequence to its gapped length + realpos_to_len = { + pos: len(gapped_fragment) + for pos, gapped_fragment in split_by_position[self._target_seqname].items() + if len(gapped_fragment) > 1 + } + + # splice together the exons + subseq = {} + + for seqid in all_seqnames: + seq_split = split_by_position[seqid] + seq_splice = [] + + filler_char = "N" if seqid == self._target_seqname else "-" + + # iterate from start to end, taking bases from split_by_position when + # they exist, using N or - for gaps when there is no alignment. 
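+            # For example (hypothetical positions): if two alignment columns
+            # that are gaps in the reference immediately precede the reference
+            # letter at position 100, split_by_position maps 100 to a
+            # three-letter string for every sequence present in that block,
+            # realpos_to_len[100] == 3, and a sequence absent at that position
+            # contributes three filler characters instead of one.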
+ append = seq_splice.append + + for exonstart, exonend in zip(starts, ends): + # exonend is exclusive + for real_pos in range(exonstart, exonend): + # if this seqname has this position, add it + if real_pos in seq_split: + append(seq_split[real_pos]) + # if not, but it's in the target_seqname, add length-matched filler + elif real_pos in realpos_to_len: + append(filler_char * realpos_to_len[real_pos]) + # it's not in either, so add a single filler character + else: + append(filler_char) + + subseq[seqid] = "".join(seq_splice) + + # make sure we're returning the right number of letters + if len(subseq[self._target_seqname].replace("-", "")) != expected_letters: + raise ValueError( + "Returning %s letters for target seqname (%s), expected %s" + % ( + len(subseq[self._target_seqname].replace("-", "")), + self._target_seqname, + expected_letters, + ) + ) + + # check to make sure all sequences are the same length as the target seqname + ref_subseq_len = len(subseq[self._target_seqname]) + + for seqid, seq in subseq.items(): + if len(seq) != ref_subseq_len: + raise ValueError( + "Returning length %s for %s, expected %s" + % (len(seq), seqid, ref_subseq_len) + ) + + # finally, build a MultipleSeqAlignment object for our final sequences + result_multiseq = [] + + for seqid, seq in subseq.items(): + seq = Seq(seq) + + seq = seq if strand == ref_first_strand else seq.reverse_complement() + + result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description="")) + + return MultipleSeqAlignment(result_multiseq) + + def __repr__(self): + """Return a string representation of the index.""" + return "MafIO.MafIndex(%r, target_seqname=%r)" % ( + self._maf_fp.name, + self._target_seqname, + ) + + def __len__(self): + """Return the number of records in the index.""" + return self._record_count diff --git a/code/lib/Bio/AlignIO/MauveIO.py b/code/lib/Bio/AlignIO/MauveIO.py new file mode 100644 index 0000000..b5f597c --- /dev/null +++ b/code/lib/Bio/AlignIO/MauveIO.py @@ -0,0 +1,349 @@ +# Copyright 2015-2015 by Eric Rasche. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve. + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). 
+ +For example, consider a progressiveMauve alignment file containing the following:: + + #FormatVersion Mauve1 + #Sequence1File a.fa + #Sequence1Entry 1 + #Sequence1Format FastA + #Sequence2File b.fa + #Sequence2Entry 2 + #Sequence2Format FastA + #Sequence3File c.fa + #Sequence3Entry 3 + #Sequence3Format FastA + #BackboneFile three.xmfa.bbcols + > 1:0-0 + a.fa + -------------------------------------------------------------------------------- + -------------------------------------------------------------------------------- + -------------------------------------------------------------------------------- + > 2:5417-5968 + b.fa + TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC- + -------------------------------------------------------------------------------- + CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG + > 3:9476-10076 - c.fa + TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG + TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG + ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT + = + > 2:5969-6015 + b.fa + ----------------------- + GGGCGAACGTATAAACCATTCTG + > 3:9429-9476 - c.fa + TTCGGTACCCTCCATGACCCACG + AAATGAGGGCCCAGGGTATGCTT + +This is a multiple sequence alignment with multiple aligned sections, so you +would probably load this using the Bio.AlignIO.parse() function: + + >>> from Bio import AlignIO + >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve") + >>> alignments = list(align) + >>> for aln in alignments: + ... print(aln) + ... + Alignment with 3 rows and 240 columns + --------------------------------------------...--- a.fa + TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968 + TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076 + Alignment with 2 rows and 46 columns + -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015 + TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476 + +Additional information is extracted from the XMFA file and available through +the annotation attribute of each record:: + + >>> for record in alignments[0]: + ... print(record.id, len(record)) + ... print(" start: %d, end: %d, strand: %d" %( + ... record.annotations['start'], record.annotations['end'], + ... record.annotations['strand'])) + ... 
+    a.fa 240
+     start: 0, end: 0, strand: 1
+    b.fa/5416-5968 240
+     start: 5416, end: 5968, strand: 1
+    c.fa/9475-10076 240
+     start: 9475, end: 10076, strand: -1
+
+"""
+import re
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+from .Interfaces import SequentialAlignmentWriter
+
+
+XMFA_HEADER_REGEX = re.compile(
+    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)"
+)
+XMFA_HEADER_REGEX_BIOPYTHON = re.compile(
+    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)"
+)
+ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}"
+
+
+def _identifier_split(identifier):
+    """Return (name, start, end) string tuple from an identifier (PRIVATE)."""
+    id, loc, strand = identifier.split(":")
+    start, end = map(int, loc.split("-"))
+    start -= 1
+    return id, start, end, strand
+
+
+class MauveWriter(SequentialAlignmentWriter):
+    """Mauve/XMFA alignment writer."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize the class."""
+        super().__init__(*args, **kwargs)
+        self._wrote_header = False
+        self._wrote_first = False
+
+    def write_alignment(self, alignment):
+        """Use this to write (another) single alignment to an open file.
+
+        Note that sequences and their annotation are recorded
+        together (rather than having a block of annotation followed
+        by a block of aligned sequences).
+        """
+        count = len(alignment)
+
+        self._length_of_sequences = alignment.get_alignment_length()
+
+        # NOTE - For now, the alignment object does not hold any per column
+        # or per alignment annotation - only per sequence.
+
+        if count == 0:
+            raise ValueError("Must have at least one sequence")
+        if self._length_of_sequences == 0:
+            raise ValueError("Non-empty sequences are required")
+
+        if not self._wrote_header:
+            self._wrote_header = True
+            self.handle.write("#FormatVersion Mauve1\n")
+            # There are some more headers, but we ignore those for now.
+            # Sequence1File unknown.fa
+            # Sequence1Entry 1
+            # Sequence1Format FastA
+            for i in range(1, count + 1):
+                self.handle.write("#Sequence%sEntry\t%s\n" % (i, i))
+
+        for idx, record in enumerate(alignment):
+            self._write_record(record, record_idx=idx)
+        self.handle.write("=\n")
+
+    def _write_record(self, record, record_idx=0):
+        """Write a single SeqRecord to the file (PRIVATE)."""
+        if self._length_of_sequences != len(record.seq):
+            raise ValueError("Sequences must all be the same length")
+
+        seq_name = record.name
+        try:
+            seq_name = str(int(record.name))
+        except ValueError:
+            seq_name = str(record_idx + 1)
+
+        # We remove the "/{start}-{end}" before writing, as it cannot be part
+        # of the produced XMFA file.
+ if "start" in record.annotations and "end" in record.annotations: + suffix0 = "/%s-%s" % ( + record.annotations["start"], + record.annotations["end"], + ) + suffix1 = "/%s-%s" % ( + record.annotations["start"] + 1, + record.annotations["end"], + ) + if seq_name[-len(suffix0) :] == suffix0: + seq_name = seq_name[: -len(suffix0)] + if seq_name[-len(suffix1) :] == suffix1: + seq_name = seq_name[: -len(suffix1)] + + if ( + "start" in record.annotations + and "end" in record.annotations + and "strand" in record.annotations + ): + id_line = ID_LINE_FMT.format( + seq_name=seq_name, + start=record.annotations["start"] + 1, + end=record.annotations["end"], + strand=("+" if record.annotations["strand"] == 1 else "-"), + filename=record.name + ".fa", + ugly_hack=record.id, + ) + lacking_annotations = False + else: + id_line = ID_LINE_FMT.format( + seq_name=seq_name, + start=0, + end=0, + strand="+", + filename=record.name + ".fa", + ugly_hack=record.id, + ) + lacking_annotations = True + + # If the sequence is an empty one, skip writing it out + if (":0-0 " in id_line or ":1-0 " in id_line) and not lacking_annotations: + # Except in the first LCB + if not self._wrote_first: + self._wrote_first = True + # The first LCB we write out is special, and must list ALL + # sequences, for the Mauve GUI + # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui + id_line = ID_LINE_FMT.format( + seq_name=seq_name, + start=0, + end=0, + strand="+", + filename=record.name + ".fa", + ugly_hack=record.id, + ) + id_line = id_line.replace("\n", " ").replace("\r", " ") + self.handle.write(id_line + "\n\n") + # Alignments lacking a start/stop/strand were generated by + # Biopython on load, and shouldn't exist according to XMFA + else: + # In other blocks, we only write sequences if they exist in a given + # alignment. + id_line = id_line.replace("\n", " ").replace("\r", " ") + self.handle.write(id_line + "\n") + for i in range(0, len(record.seq), 80): + self.handle.write("%s\n" % record.seq[i : i + 80]) + + +class MauveIterator(AlignmentIterator): + """Mauve xmfa alignment iterator.""" + + _ids = [] # for caching IDs between __next__ calls + + def __next__(self): + """Parse the next alignment from the handle.""" + handle = self.handle + line = handle.readline() + + if not line: + raise StopIteration + + # Strip out header comments + while line and line.strip().startswith("#"): + line = handle.readline() + + seqs = {} + seq_regions = {} + passed_end_alignment = False + + latest_id = None + while True: + if not line: + break # end of file + line = line.strip() + + if line.startswith("="): + # There may be more data, but we've reached the end of this + # alignment + break + elif line.startswith(">"): + m = XMFA_HEADER_REGEX_BIOPYTHON.match(line) + if not m: + m = XMFA_HEADER_REGEX.match(line) + if not m: + raise ValueError("Malformed header line: %s", line) + + parsed_id = m.group("id") + parsed_data = {} + for key in ("start", "end", "id", "strand", "name", "realname"): + try: + value = m.group(key) + if key == "start": + value = int(value) + # Convert to zero based counting + if value > 0: + value -= 1 + + if key == "end": + value = int(value) + parsed_data[key] = value + except IndexError: + # This will occur if we're asking for a group that + # doesn't exist. It's fine. 
+                        pass
+                seq_regions[parsed_id] = parsed_data
+
+                if parsed_id not in self._ids:
+                    self._ids.append(parsed_id)
+
+                seqs.setdefault(parsed_id, "")
+                latest_id = parsed_id
+            else:
+                assert not passed_end_alignment
+                if latest_id is None:
+                    raise ValueError("Saw sequence before definition line")
+                seqs[latest_id] += line
+            line = handle.readline()
+
+        assert len(seqs) <= len(self._ids)
+
+        self.ids = self._ids
+        self.sequences = seqs
+
+        if self._ids and seqs:
+            alignment_length = max(map(len, list(seqs.values())))
+            records = []
+            for id in self._ids:
+                if id not in seqs or len(seqs[id]) == 0:
+                    seq = "-" * alignment_length
+                else:
+                    seq = seqs[id]
+
+                if alignment_length != len(seq):
+                    raise ValueError(
+                        "Sequences have different lengths, or repeated identifier"
+                    )
+
+                # Sometimes we don't see a particular sequence in the
+                # alignment, so we skip that record since it isn't present in
+                # that LCB/alignment
+                if id not in seq_regions:
+                    continue
+
+                if seq_regions[id]["start"] != 0 or seq_regions[id]["end"] != 0:
+                    suffix = "/{start}-{end}".format(**seq_regions[id])
+                    if "realname" in seq_regions[id]:
+                        corrected_id = seq_regions[id]["realname"]
+                    else:
+                        corrected_id = seq_regions[id]["name"]
+                    if corrected_id.count(suffix) == 0:
+                        corrected_id += suffix
+                else:
+                    if "realname" in seq_regions[id]:
+                        corrected_id = seq_regions[id]["realname"]
+                    else:
+                        corrected_id = seq_regions[id]["name"]
+
+                record = SeqRecord(Seq(seq), id=corrected_id, name=id)
+
+                record.annotations["start"] = seq_regions[id]["start"]
+                record.annotations["end"] = seq_regions[id]["end"]
+                record.annotations["strand"] = (
+                    1 if seq_regions[id]["strand"] == "+" else -1
+                )
+
+                records.append(record)
+            return MultipleSeqAlignment(records)
+        else:
+            raise StopIteration
diff --git a/code/lib/Bio/AlignIO/MsfIO.py b/code/lib/Bio/AlignIO/MsfIO.py
new file mode 100644
index 0000000..d620f1b
--- /dev/null
+++ b/code/lib/Bio/AlignIO/MsfIO.py
@@ -0,0 +1,331 @@
+# Copyright 2019, National Marrow Donor Program (NMDP). All rights reserved.
+# Written by Peter Cock, The James Hutton Institute, under contract to NMDP.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for GCG MSF format.
+
+The file format was produced by the GCG PileUp and LocalPileUp tools,
+and later tools such as T-COFFEE and MUSCLE support it as an optional
+output format.
+
+The original GCG tool would write gaps at the ends of each sequence (which
+could be missing data) as tildes (``~``), whereas internal gaps were periods
+(``.``). This parser replaces both with minus signs (``-``) for consistency
+with the rest of ``Bio.AlignIO``.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+
+
+class MsfIterator(AlignmentIterator):
+    """GCG MSF alignment iterator."""
+
+    _header = None  # for caching lines between __next__ calls
+
+    def __next__(self):
+        """Parse the next alignment from the handle."""
+        handle = self.handle
+
+        if self._header is None:
+            line = handle.readline()
+        else:
+            # Header we saved from when we were parsing
+            # the previous alignment.
+            line = self._header
+            self._header = None
+
+        if not line:
+            raise StopIteration
+
+        # Whitelisted headers we know about.
+        known_headers = ["!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"]
+        # Examples in "Molecular Biology Software Training Manual GCG version 10"
+        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
+        # would often start as follows:
+        #
+        # !!AA_MULTIPLE_ALIGNMENT 1.0
+        # PileUp of: @/usr/users2/culhane/...
+        #
+        # etc with other seemingly free format text before getting to the
+        # MSF/Type/Check line and the following Name: lines block and // line.
+        #
+        # MUSCLE just has a line "PileUp", while other sources just use the line
+        # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
+        # (nucleotide).
+        if line.strip().split()[0] not in known_headers:
+            raise ValueError(
+                "%s is not a known GCG MSF header: %s"
+                % (line.strip().split()[0], ", ".join(known_headers))
+            )
+
+        while line and " MSF: " not in line:
+            line = handle.readline()
+
+        if not line:
+            raise ValueError("Reached end of file without MSF/Type/Check header line")
+
+        # Quoting from "Molecular Biology Software Training Manual GCG version 10"
+        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
+        # Page 31:
+        #
+        # "Header information is before a .. (double dot) in a GCG format file.
+        #  The file will also have a checksum specific for that file."
+        #
+        # This was followed by a single non-aligned sequence, but this convention
+        # appears to also be used in the GCG MSF files. Quoting other examples in
+        # this reference, page 31:
+        #
+        # localpileup_17.msf  MSF: 195  Type: P  January 6, 2000 15:41  Check: 4365 ..
+        #
+        # Except from page 148:
+        #
+        # localpileup_106.msf  MSF: 457  Type: P  November 28, 2000 16:09  Check: 2396 ..
+        #
+        # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
+        #
+        #   MSF: 689  Type: N  Check: 0000  ..
+        #
+        # By observation, the MSF value is the column count, type is N (nucleotide)
+        # or P (protein / amino acid).
+        #
+        # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
+        #
+        # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
+        # !!NA_MULTIPLE_ALIGNMENT 1.0
+        #
+        #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
+        #
+        #   Name: G26680 Len: 633  Check: 4334 Weight: 1.00
+        #   Name: G26685 Len: 633  Check: 3818 Weight: 1.00
+        #   Name: G29385 Len: 633  Check: 391 Weight: 1.00
+        #
+        # //
+        #
+        parts = line.strip("\n").split()
+        offset = parts.index("MSF:")
+        if (
+            parts[offset + 2] != "Type:"
+            or parts[-3] not in ("Check:", "CompCheck:")
+            or parts[-1] != ".."
+        ):
+            raise ValueError(
+                "GCG MSF header line should be "
+                "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
+                " not: %r" % line
+            )
+        try:
+            aln_length = int(parts[offset + 1])
+        except ValueError:
+            aln_length = -1
+        if aln_length < 0:
+            raise ValueError(
+                "GCG MSF header line should have MSF: <int> for column count, not %r"
+                % parts[offset + 1]
+            )
+        seq_type = parts[offset + 3]
+        if seq_type not in ["P", "N"]:
+            raise ValueError(
+                "GCG MSF header line should have 'Type: P' (protein) "
+                "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type
+            )
+
+        # There should be a blank line after that header line, then the Name: lines
+        #
+        # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
+        #
+        # PileUp
+        #
+        #
+        #
+        #    MSF:  628  Type: P    Check:   147   ..
+        #
+        #  Name: AK1H_ECOLI/1-378 oo  Len:  628  Check:  3643  Weight:  1.000
+        #  Name: AKH_HAEIN/1-382 oo  Len:  628  Check:  6504  Weight:  1.000
+        #
+        # //
+        ids = []
+        lengths = []
+        checks = []
+        weights = []
+        line = handle.readline()
+        while line and line.strip() != "//":
+            line = handle.readline()
+            if line.strip().startswith("Name: "):
+                if " Len: " in line and " Check: " in line and " Weight: " in line:
+                    rest = line[line.index("Name: ") + 6 :].strip()
+                    name, rest = rest.split(" Len: ")
+                    length, rest = rest.split(" Check: ")
+                    check, weight = rest.split(" Weight: ")
+                    name = name.strip()
+                    if name.endswith(" oo"):
+                        # T-COFFEE oddity, ignore this
+                        name = name[:-3]
+                    if name in ids:
+                        raise ValueError("Duplicated ID of %r" % name)
+                    if " " in name:
+                        raise NotImplementedError("Space in ID %r" % name)
+                    ids.append(name)
+                    # Expect aln_length <= int(length.strip()), see below
+                    lengths.append(int(length.strip()))
+                    checks.append(int(check.strip()))
+                    weights.append(float(weight.strip()))
+                else:
+                    raise ValueError("Malformed GCG MSF name line: %r" % line)
+        if not line:
+            raise ValueError("End of file while looking for end of header // line.")
+
+        if aln_length != max(lengths):
+            # In broken examples from IMGTHLA it was possible to continue
+            # https://github.com/ANHIG/IMGTHLA/issues/201
+            max_length = max(lengths)
+            max_count = sum(1 for _ in lengths if _ == max_length)
+            raise ValueError(
+                "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
+                % (aln_length, max_count, len(ids), max_length)
+            )
+
+        line = handle.readline()
+        if not line:
+            raise ValueError("End of file after // line, expected sequences.")
+        if line.strip():
+            raise ValueError("After // line, expected blank line before sequences.")
+
+        # Now load the sequences
+        seqs = [[] for _ in ids]  # list of empty lists
+        completed_length = 0
+        while completed_length < aln_length:
+            # Note might have a coordinate header line (seems to be optional)
+            for idx, name in enumerate(ids):
+                line = handle.readline()
+                if idx == 0 and not line.strip():
+                    # T-COFFEE uses two blank lines between blocks, rather than one
+                    while line and not line.strip():
+                        line = handle.readline()
+                if not line:
+                    raise ValueError("End of file where expecting sequence data.")
+                # print("Looking for seq for %s in line: %r" % (name, line))
+                words = line.strip().split()
+                # Should we use column numbers, rather than assuming no spaces in names?
+                if idx == 0 and words and words[0] != name:
+                    # print("Actually have a coord line")
+                    # Hopefully this is a coordinate header before the first seq
+                    try:
+                        i = int(words[0])
+                    except ValueError:
+                        i = -1
+                    if i != completed_length + 1:
+                        raise ValueError(
+                            "Expected GCG MSF coordinate line starting %i, got: %r"
+                            % (completed_length + 1, line)
+                        )
+                    if len(words) > 1:
+                        # Final block usually not full 50 chars, so expect start only.
+                        if len(words) != 2:
+                            i = -1
+                        else:
+                            try:
+                                i = int(words[1])
+                            except ValueError:
+                                i = -1
+                        if i != (
+                            completed_length + 50
+                            if completed_length + 50 < aln_length
+                            else aln_length
+                        ):
+                            raise ValueError(
+                                "Expected GCG MSF coordinate line %i to %i, got: %r"
+                                % (
+                                    completed_length + 1,
+                                    completed_length + 50
+                                    if completed_length + 50 < aln_length
+                                    else aln_length,
+                                    line,
+                                )
+                            )
+                    line = handle.readline()
+                    words = line.strip().split()
+                    # print("Still looking for seq for %s in line: %r" % (name, line))
+                # Dealt with any coordinate header line, should now be sequence
+                if not words:
+                    # Should be sequence here, but perhaps it's a short one?
+                    if (
+                        lengths[idx] < aln_length
+                        and len("".join(seqs[idx])) == lengths[idx]
+                    ):
+                        # Is this actually allowed in the format? Personally I would
+                        # expect a line with name and a block of trailing ~ here.
+                        pass
+                    else:
+                        raise ValueError(
+                            "Expected sequence for %s, got: %r" % (name, line)
+                        )
+                elif words[0] == name:
+                    assert len(words) > 1, line
+                    # print(i, name, repr(words))
+                    seqs[idx].extend(words[1:])
+                else:
+                    raise ValueError("Expected sequence for %r, got: %r" % (name, line))
+                # TODO - check the sequence lengths thus far are consistent
+                # with blocks of 50?
+            completed_length += 50
+            line = handle.readline()
+            if line.strip():
+                raise ValueError("Expected blank line, got: %r" % line)
+
+        # Skip over any whitespace at the end...
+        while True:
+            line = handle.readline()
+            if not line:
+                # End of file, no more alignments
+                break
+            elif not line.strip():
+                # Blank line, ignore
+                pass
+            elif line.strip().split()[0] in known_headers:
+                # Looks like the start of another alignment:
+                self._header = line
+                break
+            else:
+                raise ValueError("Unexpected line after GCG MSF alignment: %r" % line)
+
+        # Combine list of strings into single string, remap gaps
+        seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]
+
+        # Apply any trailing padding for short sequences
+        padded = False
+        for idx, (length, s) in enumerate(zip(lengths, seqs)):
+            if len(s) < aln_length and len(s) == length:
+                padded = True
+                seqs[idx] = s + "-" * (aln_length - len(s))
+        if padded:
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "One or more alignment sequences were truncated and have been gap padded",
+                BiopythonParserWarning,
+            )
+
+        records = (
+            SeqRecord(Seq(s), id=i, name=i, description=i, annotations={"weight": w},)
+            for (i, s, w) in zip(ids, seqs, weights)
+        )
+
+        # This will check alignment lengths are self-consistent:
+        align = MultipleSeqAlignment(records)
+        # Check matches the header:
+        if align.get_alignment_length() != aln_length:
+            raise ValueError(
+                "GCG MSF headers said alignment length %i, but have %i"
+                % (aln_length, align.get_alignment_length())
+            )
+        return align
diff --git a/code/lib/Bio/AlignIO/NexusIO.py b/code/lib/Bio/AlignIO/NexusIO.py
new file mode 100644
index 0000000..2c97e2e
--- /dev/null
+++ b/code/lib/Bio/AlignIO/NexusIO.py
@@ -0,0 +1,166 @@
+# Copyright 2008-2010, 2012-2014, 2016-2017 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for the "nexus" file format.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+See also the Bio.Nexus module (which this code calls internally),
+as this offers more than just accessing the alignment or its
+sequences as SeqRecord objects.
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.AlignIO.Interfaces import AlignmentWriter
+from Bio.Nexus import Nexus
+from Bio.SeqRecord import SeqRecord
+
+
+# You can get a couple of example files here:
+# http://www.molecularevolution.org/resources/fileformats/
+
+
+# This is a generator function!
+def NexusIterator(handle, seq_count=None):
+    """Return SeqRecord objects from a Nexus file.
+
+    This uses the Bio.Nexus module to do the hard work.
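+
+    A typical indirect call (the file name here is hypothetical) would be::
+
+        from Bio import AlignIO
+        alignment = AlignIO.read("example.nex", "nexus")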
+ + You are expected to call this function via Bio.SeqIO or Bio.AlignIO + (and not use it directly). + + NOTE - We only expect ONE alignment matrix per Nexus file, + meaning this iterator will only yield one MultipleSeqAlignment. + """ + n = Nexus.Nexus(handle) + if not n.matrix: + # No alignment found + return + + # Bio.Nexus deals with duplicated names by adding a '.copy' suffix. + # The original names and the modified names are kept in these two lists: + assert len(n.unaltered_taxlabels) == len(n.taxlabels) + + if seq_count and seq_count != len(n.unaltered_taxlabels): + raise ValueError( + "Found %i sequences, but seq_count=%i" + % (len(n.unaltered_taxlabels), seq_count) + ) + + # TODO - Can we extract any annotation too? + if n.datatype in ("dna", "nucleotide"): + annotations = {"molecule_type": "DNA"} + elif n.datatype == "rna": + annotations = {"molecule_type": "RNA"} + elif n.datatype == "protein": + annotations = {"molecule_type": "protein"} + else: + annotations = None + records = ( + SeqRecord( + n.matrix[new_name], + id=new_name, + name=old_name, + description="", + annotations=annotations, + ) + for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels) + ) + # All done + yield MultipleSeqAlignment(records) + + +class NexusWriter(AlignmentWriter): + """Nexus alignment writer. + + Note that Nexus files are only expected to hold ONE alignment + matrix. + + You are expected to call this class via the Bio.AlignIO.write() or + Bio.SeqIO.write() functions. + """ + + def write_file(self, alignments): + """Use this to write an entire file containing the given alignments. + + Arguments: + - alignments - A list or iterator returning MultipleSeqAlignment objects. + This should hold ONE and only one alignment. + + """ + align_iter = iter(alignments) # Could have been a list + try: + alignment = next(align_iter) + except StopIteration: + # Nothing to write! + return 0 + + # Check there is only one alignment... + try: + next(align_iter) + raise ValueError("We can only write one Alignment to a Nexus file.") + except StopIteration: + pass + + # Good. Actually write the single alignment, + self.write_alignment(alignment) + return 1 # we only support writing one alignment! + + def write_alignment(self, alignment, interleave=None): + """Write an alignment to file. + + Creates an empty Nexus object, adds the sequences + and then gets Nexus to prepare the output. + Default interleave behaviour: Interleave if columns > 1000 + --> Override with interleave=[True/False] + """ + if len(alignment) == 0: + raise ValueError("Must have at least one sequence") + columns = alignment.get_alignment_length() + if columns == 0: + raise ValueError("Non-empty sequences are required") + datatype = self._classify_mol_type_for_nexus(alignment) + minimal_record = ( + "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=%s; end;" + % datatype + ) + n = Nexus.Nexus(minimal_record) + for record in alignment: + # Sanity test sequences (should this be even stricter?) 
+ if datatype == "dna" and "U" in record.seq: + raise ValueError(f"{record.id} contains U, but DNA alignment") + elif datatype == "rna" and "T" in record.seq: + raise ValueError(f"{record.id} contains T, but RNA alignment") + n.add_sequence(record.id, str(record.seq)) + + # Note: MrBayes may choke on large alignments if not interleaved + if interleave is None: + interleave = columns > 1000 + n.write_nexus_data(self.handle, interleave=interleave) + + def _classify_mol_type_for_nexus(self, alignment): + """Return 'protein', 'dna', or 'rna' based on records' molecule type (PRIVATE). + + All the records must have a molecule_type annotation, and they must + agree. + + Raises an exception if this is not possible. + """ + values = {_.annotations.get("molecule_type", None) for _ in alignment} + if all(_ and "DNA" in _ for _ in values): + return "dna" # could have been a mix of "DNA" and "gDNA" + elif all(_ and "RNA" in _ for _ in values): + return "rna" # could have been a mix of "RNA" and "mRNA" + elif all(_ and "protein" in _ for _ in values): + return "protein" + else: + raise ValueError("Need the molecule type to be defined") + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest(verbose=0) diff --git a/code/lib/Bio/AlignIO/PhylipIO.py b/code/lib/Bio/AlignIO/PhylipIO.py new file mode 100644 index 0000000..cc3f665 --- /dev/null +++ b/code/lib/Bio/AlignIO/PhylipIO.py @@ -0,0 +1,454 @@ +# Copyright 2006-2016 by Peter Cock. All rights reserved. +# Revisions copyright 2011 Brandon Invergo. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). + +Support for "relaxed phylip" format is also provided. Relaxed phylip differs +from standard phylip format in the following ways: + + - No whitespace is allowed in the sequence ID. + - No truncation is performed. Instead, sequence IDs are padded to the longest + ID length, rather than 10 characters. A space separates the sequence + identifier from the sequence. + +Relaxed phylip is supported by RAxML and PHYML. + +Note +==== + +In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) +a dot/period (".") in a sequence is interpreted as meaning the same +character as in the first sequence. The PHYLIP documentation from 3.3 to 3.69 +http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: + +"a period was also previously allowed but it is no longer allowed, +because it sometimes is used in different senses in other programs" + +Biopython 1.58 or later treats dots/periods in the sequence as invalid, both +for reading and writing. Older versions did nothing special with a dot/period. 
+""" +import string + +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from .Interfaces import AlignmentIterator +from .Interfaces import SequentialAlignmentWriter + + +_PHYLIP_ID_WIDTH = 10 +_NO_DOTS = "PHYLIP format no longer allows dots in sequence" + + +class PhylipWriter(SequentialAlignmentWriter): + """Phylip alignment writer.""" + + def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH): + """Use this to write (another) single alignment to an open file. + + This code will write interlaced alignments (when the sequences are + longer than 50 characters). + + Note that record identifiers are strictly truncated to id_width, + defaulting to the value required to comply with the PHYLIP standard. + + For more information on the file format, please see: + http://evolution.genetics.washington.edu/phylip/doc/sequence.html + http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles + """ + handle = self.handle + + if len(alignment) == 0: + raise ValueError("Must have at least one sequence") + length_of_seqs = alignment.get_alignment_length() + for record in alignment: + if length_of_seqs != len(record.seq): + raise ValueError("Sequences must all be the same length") + if length_of_seqs <= 0: + raise ValueError("Non-empty sequences are required") + + # Check for repeated identifiers... + # Apply this test *after* cleaning the identifiers + names = [] + seqs = [] + for record in alignment: + """ + Quoting the PHYLIP version 3.6 documentation: + + The name should be ten characters in length, filled out to + the full ten characters by blanks if shorter. Any printable + ASCII/ISO character is allowed in the name, except for + parentheses ("(" and ")"), square brackets ("[" and "]"), + colon (":"), semicolon (";") and comma (","). If you forget + to extend the names to ten characters in length by blanks, + the program [i.e. PHYLIP] will get out of synchronization + with the contents of the data file, and an error message will + result. + + Note that Tab characters count as only one character in the + species names. Their inclusion can cause trouble. + """ + name = sanitize_name(record.id, id_width) + if name in names: + raise ValueError( + "Repeated name %r (originally %r), possibly due to truncation" + % (name, record.id) + ) + names.append(name) + sequence = str(record.seq) + if "." in sequence: + # Do this check here (once per record, not once per block) + raise ValueError(_NO_DOTS) + seqs.append(sequence) + + # From experimentation, the use of tabs is not understood by the + # EMBOSS suite. The nature of the expected white space is not + # defined in the PHYLIP documentation, simply "These are in free + # format, separated by blanks". We'll use spaces to keep EMBOSS + # happy. + handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) + block = 0 + while True: + for name, sequence in zip(names, seqs): + if block == 0: + # Write name (truncated/padded to id_width characters) + # Now truncate and right pad to expected length. + handle.write(name[:id_width].ljust(id_width)) + else: + # write indent + handle.write(" " * id_width) + # Write five chunks of ten letters per line... + for chunk in range(0, 5): + i = block * 50 + chunk * 10 + seq_segment = sequence[i : i + 10] + # TODO - Force any gaps to be '-' character? + # TODO - How to cope with '?' or '.' in the sequence? 
+ handle.write(" %s" % seq_segment) + if i + 10 > length_of_seqs: + break + handle.write("\n") + block += 1 + if block * 50 >= length_of_seqs: + break + handle.write("\n") + + +class PhylipIterator(AlignmentIterator): + """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. + + Record identifiers are limited to at most 10 characters. + + It only copes with interlaced phylip files! Sequential files won't work + where the sequences are split over multiple lines. + + For more information on the file format, please see: + http://evolution.genetics.washington.edu/phylip/doc/sequence.html + http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles + """ + + # Default truncation length + id_width = _PHYLIP_ID_WIDTH + + _header = None # for caching lines between __next__ calls + + def _is_header(self, line): + line = line.strip() + parts = [x for x in line.split() if x] + if len(parts) != 2: + return False # First line should have two integers + try: + number_of_seqs = int(parts[0]) + length_of_seqs = int(parts[1]) + return True + except ValueError: + return False # First line should have two integers + + def _split_id(self, line): + """Extract the sequence ID from a Phylip line (PRIVATE). + + Returning a tuple containing: (sequence_id, sequence_residues) + + The first 10 characters in the line are are the sequence id, the + remainder are sequence data. + """ + seq_id = line[: self.id_width].strip() + seq = line[self.id_width :].strip().replace(" ", "") + return seq_id, seq + + def __next__(self): + """Parse the next alignment from the handle.""" + handle = self.handle + + if self._header is None: + line = handle.readline() + else: + # Header we saved from when we were parsing + # the previous alignment. + line = self._header + self._header = None + + if not line: + raise StopIteration + line = line.strip() + parts = [x for x in line.split() if x] + if len(parts) != 2: + raise ValueError("First line should have two integers") + try: + number_of_seqs = int(parts[0]) + length_of_seqs = int(parts[1]) + except ValueError: + raise ValueError("First line should have two integers") from None + + assert self._is_header(line) + + if ( + self.records_per_alignment is not None + and self.records_per_alignment != number_of_seqs + ): + raise ValueError( + "Found %i records in this alignment, told to expect %i" + % (number_of_seqs, self.records_per_alignment) + ) + + ids = [] + seqs = [] + + # By default, expects STRICT truncation / padding to 10 characters. + # Does not require any whitespace between name and seq. + for i in range(number_of_seqs): + line = handle.readline().rstrip() + sequence_id, s = self._split_id(line) + ids.append(sequence_id) + if "." in s: + raise ValueError(_NO_DOTS) + seqs.append([s]) + + # Look for further blocks + line = "" + while True: + # Skip any blank lines between blocks... + while "" == line.strip(): + line = handle.readline() + if not line: + break # end of file + if not line: + break # end of file + + if self._is_header(line): + # Looks like the start of a concatenated alignment + self._header = line + break + + # print("New block...") + for i in range(number_of_seqs): + s = line.strip().replace(" ", "") + if "." 
in s: + raise ValueError(_NO_DOTS) + seqs[i].append(s) + line = handle.readline() + if (not line) and i + 1 < number_of_seqs: + raise ValueError("End of file mid-block") + if not line: + break # end of file + + records = ( + SeqRecord(Seq("".join(s)), id=i, name=i, description=i) + for (i, s) in zip(ids, seqs) + ) + return MultipleSeqAlignment(records) + + +# Relaxed Phylip +class RelaxedPhylipWriter(PhylipWriter): + """Relaxed Phylip format writer.""" + + def write_alignment(self, alignment): + """Write a relaxed phylip alignment.""" + # Check inputs + for name in (s.id.strip() for s in alignment): + if any(c in name for c in string.whitespace): + raise ValueError("Whitespace not allowed in identifier: %s" % name) + + # Calculate a truncation length - maximum length of sequence ID plus a + # single character for padding + # If no sequences, set id_width to 1. super(...) call will raise a + # ValueError + if len(alignment) == 0: + id_width = 1 + else: + id_width = max(len(s.id.strip()) for s in alignment) + 1 + super().write_alignment(alignment, id_width) + + +class RelaxedPhylipIterator(PhylipIterator): + """Relaxed Phylip format Iterator.""" + + def _split_id(self, line): + """Extract the sequence ID from a Phylip line (PRIVATE). + + Returns a tuple containing: (sequence_id, sequence_residues) + + For relaxed format split at the first whitespace character. + """ + seq_id, sequence = line.split(None, 1) + sequence = sequence.strip().replace(" ", "") + return seq_id, sequence + + +class SequentialPhylipWriter(SequentialAlignmentWriter): + """Sequential Phylip format Writer.""" + + def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH): + """Write a Phylip alignment to the handle.""" + handle = self.handle + + if len(alignment) == 0: + raise ValueError("Must have at least one sequence") + length_of_seqs = alignment.get_alignment_length() + for record in alignment: + if length_of_seqs != len(record.seq): + raise ValueError("Sequences must all be the same length") + if length_of_seqs <= 0: + raise ValueError("Non-empty sequences are required") + + # Check for repeated identifiers... + # Apply this test *after* cleaning the identifiers + names = [] + for record in alignment: + # Either remove the banned characters, or map them to something + # else like an underscore "_" or pipe "|" character... + name = sanitize_name(record.id, id_width) + if name in names: + raise ValueError( + "Repeated name %r (originally %r), possibly due to truncation" + % (name, record.id) + ) + names.append(name) + + # From experimentation, the use of tabs is not understood by the + # EMBOSS suite. The nature of the expected white space is not + # defined in the PHYLIP documentation, simply "These are in free + # format, separated by blanks". We'll use spaces to keep EMBOSS + # happy. + handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) + for name, record in zip(names, alignment): + sequence = str(record.seq) + if "." in sequence: + raise ValueError(_NO_DOTS) + handle.write(name[:id_width].ljust(id_width)) + # Write the entire sequence to one line (see sequential format + # notes in the SequentialPhylipIterator docstring + handle.write(sequence) + handle.write("\n") + + +class SequentialPhylipIterator(PhylipIterator): + """Sequential Phylip format Iterator. + + The sequential format carries the same restrictions as the normal + interleaved one, with the difference being that the sequences are listed + sequentially, each sequence written in its entirety before the start of + the next. 
According to the PHYLIP documentation for input file + formatting, newlines and spaces may optionally be entered at any point + in the sequences. + """ + + _header = None # for caching lines between __next__ calls + + def __next__(self): + """Parse the next alignment from the handle.""" + handle = self.handle + + if self._header is None: + line = handle.readline() + else: + # Header we saved from when we were parsing + # the previous alignment. + line = self._header + self._header = None + + if not line: + raise StopIteration + line = line.strip() + parts = [x for x in line.split() if x] + if len(parts) != 2: + raise ValueError("First line should have two integers") + try: + number_of_seqs = int(parts[0]) + length_of_seqs = int(parts[1]) + except ValueError: + raise ValueError("First line should have two integers") from None + + assert self._is_header(line) + + if ( + self.records_per_alignment is not None + and self.records_per_alignment != number_of_seqs + ): + raise ValueError( + "Found %i records in this alignment, told to expect %i" + % (number_of_seqs, self.records_per_alignment) + ) + + ids = [] + seqs = [] + + # By default, expects STRICT truncation / padding to 10 characters. + # Does not require any whitespace between name and seq. + for i in range(number_of_seqs): + line = handle.readline().rstrip() + sequence_id, s = self._split_id(line) + ids.append(sequence_id) + while len(s) < length_of_seqs: + # The sequence may be split into multiple lines + line = handle.readline().strip() + if not line: + break + if line == "": + continue + s = "".join([s, line.strip().replace(" ", "")]) + if len(s) > length_of_seqs: + raise ValueError( + "Found a record of length %i, " + "should be %i" % (len(s), length_of_seqs) + ) + if "." in s: + raise ValueError(_NO_DOTS) + seqs.append(s) + while True: + # Find other alignments in the file + line = handle.readline() + if not line: + break + if self._is_header(line): + self._header = line + break + + records = ( + SeqRecord(Seq(s), id=i, name=i, description=i) for (i, s) in zip(ids, seqs) + ) + return MultipleSeqAlignment(records) + + +def sanitize_name(name, width=None): + """Sanitise sequence identifier for output. + + Removes the banned characters "[]()" and replaces the characters ":;" + with "|". The name is truncated to "width" characters if specified. + """ + name = name.strip() + for char in "[](),": + name = name.replace(char, "") + for char in ":;": + name = name.replace(char, "|") + if width is not None: + name = name[:width] + return name diff --git a/code/lib/Bio/AlignIO/StockholmIO.py b/code/lib/Bio/AlignIO/StockholmIO.py new file mode 100644 index 0000000..386e762 --- /dev/null +++ b/code/lib/Bio/AlignIO/StockholmIO.py @@ -0,0 +1,630 @@ +# Copyright 2006-2016 by Peter Cock. All rights reserved. +# Revisions copyright 2015 by Ben Woodcroft. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.AlignIO support for "stockholm" format (used in the PFAM database). + +You are expected to use this module via the Bio.AlignIO functions (or the +Bio.SeqIO functions if you want to work directly with the gapped sequences). + +For example, consider a Stockholm alignment file containing the following:: + + # STOCKHOLM 1.0 + #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. 
+    AP001509.1          UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
+    #=GR AP001509.1 SS  -----------------<<<<<<<<---..<<-<<-------->>->>..--
+    AE007476.1          AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
+    #=GR AE007476.1 SS  -----------------<<<<<<<<-----<<.<<-------->>.>>----
+
+    #=GC SS_cons        ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
+    AP001509.1          CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+    #=GR AP001509.1 SS  -------<<<<<--------->>>>>--->>>>>>>>---------------
+    AE007476.1          UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+    #=GR AE007476.1 SS  ------.<<<<<--------->>>>>.-->>>>>>>>---------------
+    //
+
+This is a single multiple sequence alignment, so you would probably load this
+using the Bio.AlignIO.read() function:
+
+    >>> from Bio import AlignIO
+    >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm")
+    >>> print(align)
+    Alignment with 2 rows and 104 columns
+    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
+    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
+    >>> for record in align:
+    ...     print("%s %i" % (record.id, len(record)))
+    AP001509.1 104
+    AE007476.1 104
+
+In addition to the sequences themselves, this example alignment also includes
+some GR lines for the secondary structure of the sequences. These are
+strings, with one character for each letter in the associated sequence:
+
+    >>> for record in align:
+    ...     print(record.id)
+    ...     print(record.seq)
+    ...     print(record.letter_annotations['secondary_structure'])
+    AP001509.1
+    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+    -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+    AE007476.1
+    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+    -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+
+Any general annotation for each row is recorded in the SeqRecord's annotations
+dictionary. Any per-column annotation for the entire alignment is in the
+alignment's column annotations dictionary, such as the secondary structure
+consensus in this example:
+
+    >>> sorted(align.column_annotations.keys())
+    ['secondary_structure']
+    >>> align.column_annotations["secondary_structure"]
+    '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'
+
+You can output this alignment in many different file formats
+using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:
+
+    >>> print(format(align, "fasta"))
+    >AP001509.1
+    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
+    GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+    >AE007476.1
+    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
+    GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+
+
+Most output formats won't be able to hold the annotation possible in a
+Stockholm file:
+
+    >>> print(format(align, "stockholm"))
+    # STOCKHOLM 1.0
+    #=GF SQ 2
+    AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+    #=GS AP001509.1 AC AP001509.1
+    #=GS AP001509.1 DE AP001509.1
+    #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+    AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+    #=GS AE007476.1 AC AE007476.1
+    #=GS AE007476.1 DE AE007476.1
+    #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+    #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............
+    //
+
+
+Note that when writing Stockholm files, AlignIO does not break long sequences
+up and interleave them (as in the input file shown above). The standard
+allows this simpler layout, and it is more likely to be understood by other
+tools.
+
+Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to
+iterate over the alignment rows as SeqRecord objects - rather than working
+with Alignment objects.
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm"):
+    ...     print(record.id)
+    ...     print(record.seq)
+    ...     print(record.letter_annotations['secondary_structure'])
+    AP001509.1
+    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+    -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+    AE007476.1
+    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+    -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+
+Remember that if you slice a SeqRecord, the per-letter-annotations like the
+secondary structure string here, are also sliced:
+
+    >>> sub_record = record[10:20]
+    >>> print(sub_record.seq)
+    AUCGUUUUAC
+    >>> print(sub_record.letter_annotations['secondary_structure'])
+    -------<<<
+
+Likewise with the alignment object, as long as you are not dropping any rows,
+slicing specific columns of an alignment will slice any per-column-annotations:
+
+    >>> align.column_annotations["secondary_structure"]
+    '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'
+ >>> part_align = align[:,10:20] + >>> part_align.column_annotations["secondary_structure"] + '.......<<<' + +You can also see this in the Stockholm output of this partial-alignment: + + >>> print(format(part_align, "stockholm")) + # STOCKHOLM 1.0 + #=GF SQ 2 + AP001509.1 UCAACACUCU + #=GS AP001509.1 AC AP001509.1 + #=GS AP001509.1 DE AP001509.1 + #=GR AP001509.1 SS -------<<< + AE007476.1 AUCGUUUUAC + #=GS AE007476.1 AC AE007476.1 + #=GS AE007476.1 DE AE007476.1 + #=GR AE007476.1 SS -------<<< + #=GC SS_cons .......<<< + // + + +""" +from collections import OrderedDict + +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from .Interfaces import AlignmentIterator +from .Interfaces import SequentialAlignmentWriter + + +class StockholmWriter(SequentialAlignmentWriter): + """Stockholm/PFAM alignment writer.""" + + # These dictionaries should be kept in sync with those + # defined in the StockholmIterator class. + pfam_gr_mapping = { + "secondary_structure": "SS", + "surface_accessibility": "SA", + "transmembrane": "TM", + "posterior_probability": "PP", + "ligand_binding": "LI", + "active_site": "AS", + "intron": "IN", + } + # These GC mappings are in addition to *_cons in GR mapping: + pfam_gc_mapping = {"reference_annotation": "RF", "model_mask": "MM"} + # Following dictionary deliberately does not cover AC, DE or DR + pfam_gs_mapping = {"organism": "OS", "organism_classification": "OC", "look": "LO"} + + def write_alignment(self, alignment): + """Use this to write (another) single alignment to an open file. + + Note that sequences and their annotation are recorded + together (rather than having a block of annotation followed + by a block of aligned sequences). + """ + count = len(alignment) + + self._length_of_sequences = alignment.get_alignment_length() + self._ids_written = [] + + if count == 0: + raise ValueError("Must have at least one sequence") + if self._length_of_sequences == 0: + raise ValueError("Non-empty sequences are required") + + self.handle.write("# STOCKHOLM 1.0\n") + self.handle.write("#=GF SQ %i\n" % count) + for record in alignment: + self._write_record(record) + # This shouldn't be None... but just in case, + if alignment.column_annotations: + for k, v in sorted(alignment.column_annotations.items()): + if k in self.pfam_gc_mapping: + self.handle.write("#=GC %s %s\n" % (self.pfam_gc_mapping[k], v)) + elif k in self.pfam_gr_mapping: + self.handle.write( + "#=GC %s %s\n" % (self.pfam_gr_mapping[k] + "_cons", v) + ) + else: + # It doesn't follow the PFAM standards, but should we record + # this data anyway? 
+                    pass
+        self.handle.write("//\n")
+
+    def _write_record(self, record):
+        """Write a single SeqRecord to the file (PRIVATE)."""
+        if self._length_of_sequences != len(record.seq):
+            raise ValueError("Sequences must all be the same length")
+
+        # For the case of Stockholm to Stockholm, try and use record.name
+        seq_name = record.id
+        if record.name is not None:
+            if "accession" in record.annotations:
+                if record.id == record.annotations["accession"]:
+                    seq_name = record.name
+
+        # In the Stockholm file format, spaces are not allowed in the id
+        seq_name = seq_name.replace(" ", "_")
+
+        if "start" in record.annotations and "end" in record.annotations:
+            suffix = "/%s-%s" % (
+                record.annotations["start"],
+                record.annotations["end"],
+            )
+            if seq_name[-len(suffix) :] != suffix:
+                seq_name = "%s/%s-%s" % (
+                    seq_name,
+                    record.annotations["start"],
+                    record.annotations["end"],
+                )
+
+        if seq_name in self._ids_written:
+            raise ValueError("Duplicate record identifier: %s" % seq_name)
+        self._ids_written.append(seq_name)
+        self.handle.write("%s %s\n" % (seq_name, record.seq))
+
+        # The recommended placement for GS lines (per sequence annotation)
+        # is above the alignment (as a header block) or just below the
+        # corresponding sequence.
+        #
+        # The recommended placement for GR lines (per sequence per column
+        # annotation such as secondary structure) is just below the
+        # corresponding sequence.
+        #
+        # We put both just below the corresponding sequence as this allows
+        # us to write the file using a single pass through the records.
+
+        # AC = Accession
+        if "accession" in record.annotations:
+            self.handle.write(
+                "#=GS %s AC %s\n"
+                % (seq_name, self.clean(record.annotations["accession"]))
+            )
+        elif record.id:
+            self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id)))
+
+        # DE = description
+        if record.description:
+            self.handle.write(
+                "#=GS %s DE %s\n" % (seq_name, self.clean(record.description))
+            )
+
+        # DR = database links
+        for xref in record.dbxrefs:
+            self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref)))
+
+        # GS = other per sequence annotation
+        for key, value in record.annotations.items():
+            if key in self.pfam_gs_mapping:
+                data = self.clean(str(value))
+                if data:
+                    self.handle.write(
+                        "#=GS %s %s %s\n"
+                        % (seq_name, self.clean(self.pfam_gs_mapping[key]), data)
+                    )
+            else:
+                # It doesn't follow the PFAM standards, but should we record
+                # this data anyway?
+                pass
+
+        # GR = per row per column sequence annotation
+        for key, value in record.letter_annotations.items():
+            if key in self.pfam_gr_mapping and len(str(value)) == len(record.seq):
+                data = self.clean(str(value))
+                if data:
+                    self.handle.write(
+                        "#=GR %s %s %s\n"
+                        % (seq_name, self.clean(self.pfam_gr_mapping[key]), data)
+                    )
+            else:
+                # It doesn't follow the PFAM standards, but should we record
+                # this data anyway?
+                pass
+
+
+class StockholmIterator(AlignmentIterator):
+    """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.
+
+    The file may contain multiple concatenated alignments, which are loaded
+    and returned incrementally.
+
+    This parser will detect if the Stockholm file follows the PFAM
+    conventions for sequence specific meta-data (lines starting #=GS
+    and #=GR) and populates the SeqRecord fields accordingly.
+
+    Any annotation which does not follow the PFAM conventions is currently
+    ignored.
+
+    If an accession is provided for an entry in the meta data, IT WILL NOT
+    be used as the record.id (it will be recorded in the record's
+    annotations). This is because some files have (sub) sequences from
+    different parts of the same accession (differentiated by different
+    start-end positions).
+
+    Wrap-around alignments are not supported - each sequence must be on
+    a single line. However, interlaced sequences should work.
+
+    For more information on the file format, please see:
+    http://sonnhammer.sbc.su.se/Stockholm.html
+    https://en.wikipedia.org/wiki/Stockholm_format
+    http://bioperl.org/formats/alignment_formats/Stockholm_multiple_alignment_format.html
+
+    For consistency with BioPerl and EMBOSS we call this the "stockholm"
+    format.
+    """
+
+    # These dictionaries should be kept in sync with those
+    # defined in the PfamStockholmWriter class.
+    pfam_gr_mapping = {
+        "SS": "secondary_structure",
+        "SA": "surface_accessibility",
+        "TM": "transmembrane",
+        "PP": "posterior_probability",
+        "LI": "ligand_binding",
+        "AS": "active_site",
+        "IN": "intron",
+    }
+    # These GC mappings are in addition to *_cons in GR mapping:
+    pfam_gc_mapping = {"RF": "reference_annotation", "MM": "model_mask"}
+    # Following dictionary deliberately does not cover AC, DE or DR
+    pfam_gs_mapping = {"OS": "organism", "OC": "organism_classification", "LO": "look"}
+
+    _header = None  # for caching lines between __next__ calls
+
+    def __next__(self):
+        """Parse the next alignment from the handle."""
+        handle = self.handle
+
+        if self._header is None:
+            line = handle.readline()
+        else:
+            # Header we saved from when we were parsing
+            # the previous alignment.
+            line = self._header
+            self._header = None
+
+        if not line:
+            # Empty file - just give up.
+            raise StopIteration
+        if line.strip() != "# STOCKHOLM 1.0":
+            raise ValueError("Did not find STOCKHOLM header")
+
+        # Note: If this file follows the PFAM conventions, there should be
+        # a line containing the number of sequences, e.g. "#=GF SQ 67"
+        # We do not check for this - perhaps we should, and verify that
+        # if present it agrees with our parsing.
+
+        seqs = {}
+        ids = OrderedDict()  # Really only need an OrderedSet, but python lacks this
+        gs = {}
+        gr = {}
+        gf = {}
+        gc = {}
+        passed_end_alignment = False
+        while True:
+            line = handle.readline()
+            if not line:
+                break  # end of file
+            line = line.strip()  # remove trailing \n
+            if line == "# STOCKHOLM 1.0":
+                self._header = line
+                break
+            elif line == "//":
+                # The "//" line indicates the end of the alignment.
+                # There may still be more meta-data
+                passed_end_alignment = True
+            elif line == "":
+                # blank line, ignore
+                pass
+            elif line[0] != "#":
+                # Sequence
+                # Format: "<seqname> <sequence>"
+                assert not passed_end_alignment
+                parts = [x.strip() for x in line.split(" ", 1)]
+                if len(parts) != 2:
+                    # This might be someone attempting to store a zero length sequence?
+                    raise ValueError(
+                        "Could not split line into identifier and sequence:\n" + line
+                    )
+                seq_id, seq = parts
+                if seq_id not in ids:
+                    ids[seq_id] = True
+                seqs.setdefault(seq_id, "")
+                seqs[seq_id] += seq.replace(".", "-")
+            elif len(line) >= 5:
+                # Comment line or meta-data
+                if line[:5] == "#=GF ":
+                    # Generic per-File annotation, free text
+                    # Format: "#=GF <feature> <free text>"
+                    feature, text = line[5:].strip().split(None, 1)
+                    # Each feature key could be used more than once,
+                    # so store the entries as a list of strings.
+                    if feature not in gf:
+                        gf[feature] = [text]
+                    else:
+                        gf[feature].append(text)
+                elif line[:5] == "#=GC ":
+                    # Generic per-Column annotation, exactly 1 char per column
+                    # Format: "#=GC <feature> <exactly 1 char per column>"
+                    feature, text = line[5:].strip().split(None, 2)
+                    if feature not in gc:
+                        gc[feature] = ""
+                    gc[feature] += text.strip()  # append to any previous entry
+                    # Might be interleaved blocks, so can't check length yet
+                elif line[:5] == "#=GS ":
+                    # Generic per-Sequence annotation, free text
+                    # Format: "#=GS <seqname> <feature> <free text>"
+                    try:
+                        seq_id, feature, text = line[5:].strip().split(None, 2)
+                    except ValueError:
+                        # Free text can sometimes be empty, which a one line split throws an error for.
+                        # See https://github.com/biopython/biopython/issues/2982 for more details
+                        seq_id, feature = line[5:].strip().split(None, 1)
+                        text = ""
+                    # if seq_id not in ids:
+                    #    ids.append(seq_id)
+                    if seq_id not in gs:
+                        gs[seq_id] = {}
+                    if feature not in gs[seq_id]:
+                        gs[seq_id][feature] = [text]
+                    else:
+                        gs[seq_id][feature].append(text)
+                elif line[:5] == "#=GR ":
+                    # Generic per-Sequence AND per-Column markup
+                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
+                    seq_id, feature, text = line[5:].strip().split(None, 2)
+                    # if seq_id not in ids:
+                    #    ids.append(seq_id)
+                    if seq_id not in gr:
+                        gr[seq_id] = {}
+                    if feature not in gr[seq_id]:
+                        gr[seq_id][feature] = ""
+                    gr[seq_id][feature] += text.strip()  # append to any previous entry
+                    # Might be interleaved blocks, so can't check length yet
+            # Next line...
+
+        assert len(seqs) <= len(ids)
+        # assert len(gs) <= len(ids)
+        # assert len(gr) <= len(ids)
+
+        self.ids = ids.keys()
+        self.sequences = seqs
+        self.seq_annotation = gs
+        self.seq_col_annotation = gr
+
+        if ids and seqs:
+
+            if (
+                self.records_per_alignment is not None
+                and self.records_per_alignment != len(ids)
+            ):
+                raise ValueError(
+                    "Found %i records in this alignment, told to expect %i"
+                    % (len(ids), self.records_per_alignment)
+                )
+
+            alignment_length = len(list(seqs.values())[0])
+            records = []  # Alignment obj will put them all in a list anyway
+            for seq_id in ids:
+                seq = seqs[seq_id]
+                if alignment_length != len(seq):
+                    raise ValueError(
+                        "Sequences have different lengths, or repeated identifier"
+                    )
+                name, start, end = self._identifier_split(seq_id)
+                record = SeqRecord(
+                    Seq(seq),
+                    id=seq_id,
+                    name=name,
+                    description=seq_id,
+                    annotations={"accession": name},
+                )
+                # Accession will be overridden by _populate_meta_data if an explicit
+                # accession is provided:
+                record.annotations["accession"] = name
+
+                if start is not None:
+                    record.annotations["start"] = start
+                if end is not None:
+                    record.annotations["end"] = end
+
+                self._populate_meta_data(seq_id, record)
+                records.append(record)
+            for k, v in gc.items():
+                if len(v) != alignment_length:
+                    raise ValueError(
+                        "%s length %i, expected %i" % (k, len(v), alignment_length)
+                    )
+            alignment = MultipleSeqAlignment(records)
+
+            for k, v in sorted(gc.items()):
+                if k in self.pfam_gc_mapping:
+                    alignment.column_annotations[self.pfam_gc_mapping[k]] = v
+                elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
+                    alignment.column_annotations[self.pfam_gr_mapping[k[:-5]]] = v
+                else:
+                    # Ignore it?
+                    alignment.column_annotations["GC:" + k] = v
+
+            # TODO - Introduce an annotated alignment class?
+            # For now, store the annotation in a new private property:
+            alignment._annotations = gr
+
+            return alignment
+        else:
+            raise StopIteration
+
+    def _identifier_split(self, identifier):
+        """Return (name, start, end) string tuple from an identifier (PRIVATE)."""
+        if "/" in identifier:
+            name, start_end = identifier.rsplit("/", 1)
+            if start_end.count("-") == 1:
+                try:
+                    start, end = start_end.split("-")
+                    return name, int(start), int(end)
+                except ValueError:
+                    # Non-integers after final '/' - fall through
+                    pass
+        return identifier, None, None
+
+    def _get_meta_data(self, identifier, meta_dict):
+        """Take an identifier and return a dict of all meta-data matching it (PRIVATE).
+
+        For example, given "Q9PN73_CAMJE/149-220" will return all matches to
+        this or "Q9PN73_CAMJE", which is the identifier without its /start-end
+        suffix.
+
+        In the example below, the suffix is required to match the AC, but must
+        be removed to match the OS and OC meta-data::
+
+            # STOCKHOLM 1.0
+            #=GS Q9PN73_CAMJE/149-220  AC Q9PN73
+            ...
+            Q9PN73_CAMJE/149-220  NKA...
+            ...
+            #=GS Q9PN73_CAMJE  OS Campylobacter jejuni
+            #=GS Q9PN73_CAMJE  OC Bacteria
+
+        This function will return an empty dictionary if no data is found.
+        """
+        name, start, end = self._identifier_split(identifier)
+        if name == identifier:
+            identifier_keys = [identifier]
+        else:
+            identifier_keys = [identifier, name]
+        answer = {}
+        for identifier_key in identifier_keys:
+            try:
+                for feature_key in meta_dict[identifier_key]:
+                    answer[feature_key] = meta_dict[identifier_key][feature_key]
+            except KeyError:
+                pass
+        return answer
+
+    def _populate_meta_data(self, identifier, record):
+        """Add meta-data to a SeqRecord's annotations dictionary (PRIVATE).
+
+        This function applies the PFAM conventions.
+        """
+        seq_data = self._get_meta_data(identifier, self.seq_annotation)
+        for feature in seq_data:
+            # Note this dictionary contains lists!
+            if feature == "AC":  # ACcession number
+                assert len(seq_data[feature]) == 1
+                record.annotations["accession"] = seq_data[feature][0]
+            elif feature == "DE":  # DEscription
+                record.description = "\n".join(seq_data[feature])
+            elif feature == "DR":  # Database Reference
+                # Should we try and parse the strings?
+                record.dbxrefs = seq_data[feature]
+            elif feature in self.pfam_gs_mapping:
+                record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(
+                    seq_data[feature]
+                )
+            else:
+                # Ignore it?
+                record.annotations["GS:" + feature] = ", ".join(seq_data[feature])
+
+        # Now record the per-letter-annotations
+        seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation)
+        for feature in seq_col_data:
+            # Note this dictionary contains strings!
+            if feature in self.pfam_gr_mapping:
+                record.letter_annotations[self.pfam_gr_mapping[feature]] = seq_col_data[
+                    feature
+                ]
+            else:
+                # Ignore it?
+                record.letter_annotations["GR:" + feature] = seq_col_data[feature]
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/AlignIO/__init__.py b/code/lib/Bio/AlignIO/__init__.py
new file mode 100644
index 0000000..fe01f8f
--- /dev/null
+++ b/code/lib/Bio/AlignIO/__init__.py
@@ -0,0 +1,480 @@
+# Copyright 2008-2018 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Multiple sequence alignment input/output as alignment objects.
+
+The Bio.AlignIO interface is deliberately very similar to Bio.SeqIO, and in
+fact the two are connected internally. Both modules use the same set of file
+format names (lower case strings). From the user's perspective, you can read
+in a PHYLIP file containing one or more alignments using Bio.AlignIO, or you
+can read in the sequences within these alignments using Bio.SeqIO.
+
+Bio.AlignIO is also documented at http://biopython.org/wiki/AlignIO and by
+a whole chapter in our tutorial:
+
+* `HTML Tutorial`_
+* `PDF Tutorial`_
+
+.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+
+Input
+-----
+For the typical special case when your file or handle contains one and only
+one alignment, use the function Bio.AlignIO.read(). This takes an input file
+handle (or in recent versions of Biopython a filename as a string), format
+string and optional number of sequences per alignment. It will return a single
+MultipleSeqAlignment object (or raise an exception if there isn't just one
+alignment):
+
+>>> from Bio import AlignIO
+>>> align = AlignIO.read("Phylip/interlaced.phy", "phylip")
+>>> print(align)
+Alignment with 3 rows and 384 columns
+-----MKVILLFVLAVFTVFVSS---------------RGIPPE...I-- CYS1_DICDI
+MAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTL...VAA ALEU_HORVU
+------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSL...PLV CATH_HUMAN
+
+For the general case, when the handle could contain any number of alignments,
+use the function Bio.AlignIO.parse(...) which takes the same arguments, but
+returns an iterator giving MultipleSeqAlignment objects (typically used in a
+for loop). If you want random access to the alignments by number, turn this
+into a list:
+
+>>> from Bio import AlignIO
+>>> alignments = list(AlignIO.parse("Emboss/needle.txt", "emboss"))
+>>> print(alignments[2])
+Alignment with 2 rows and 120 columns
+-KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKER...--- ref_rec
+LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHP...HKE gi|94967506|receiver
+
+Most alignment file formats can be concatenated so as to hold as many
+different multiple sequence alignments as possible. One common example
+is the output of the tool seqboot in the PHYLIP suite. Sometimes there
+can be a file header and footer, as seen in the EMBOSS alignment output.
+
+Output
+------
+Use the function Bio.AlignIO.write(...), which takes a complete set of
+Alignment objects (either as a list, or an iterator), an output file handle
+(or filename in recent versions of Biopython) and of course the file format::
+
+    from Bio import AlignIO
+    alignments = ...
+    count = AlignIO.write(alignments, "example.faa", "fasta")
+
+If using a handle make sure to close it to flush the data to the disk::
+
+    from Bio import AlignIO
+    alignments = ...
+    with open("example.faa", "w") as handle:
+        count = AlignIO.write(alignments, handle, "fasta")
+
+In general, you are expected to call this function once (with all your
+alignments) and then close the file handle. However, for file formats
+like PHYLIP where multiple alignments are stored sequentially (with no file
+header and footer), then multiple calls to the write function should work as
+expected when using handles.
+
+If you are using a filename, the repeated calls to the write functions will
+overwrite the existing file each time.
+
+Conversion
+----------
+The Bio.AlignIO.convert(...) function allows an easy interface for simple
+alignment file format conversions.
Additionally, it may use file format +specific optimisations so this should be the fastest way too. + +In general however, you can combine the Bio.AlignIO.parse(...) function with +the Bio.AlignIO.write(...) function for sequence file conversion. Using +generator expressions provides a memory efficient way to perform filtering or +other extra operations as part of the process. + +File Formats +------------ +When specifying the file format, use lowercase strings. The same format +names are also used in Bio.SeqIO and include the following: + + - clustal - Output from Clustal W or X, see also the module Bio.Clustalw + which can be used to run the command line tool from Biopython. + - emboss - EMBOSS tools' "pairs" and "simple" alignment formats. + - fasta - The generic sequence file format where each record starts with + an identifier line starting with a ">" character, followed by + lines of sequence. + - fasta-m10 - For the pairwise alignments output by Bill Pearson's FASTA + tools when used with the -m 10 command line option for machine + readable output. + - ig - The IntelliGenetics file format, apparently the same as the + MASE alignment format. + - msf - The GCG MSF alignment format, originally from PileUp tool. + - nexus - Output from NEXUS, see also the module Bio.Nexus which can also + read any phylogenetic trees in these files. + - phylip - Interlaced PHYLIP, as used by the PHYLIP tools. + - phylip-sequential - Sequential PHYLIP. + - phylip-relaxed - PHYLIP like format allowing longer names. + - stockholm - A richly annotated alignment file format used by PFAM. + - mauve - Output from progressiveMauve/Mauve + +Note that while Bio.AlignIO can read all the above file formats, it cannot +write to all of them. + +You can also use any file format supported by Bio.SeqIO, such as "fasta" or +"ig" (which are listed above), PROVIDED the sequences in your file are all the +same length. +""" +# TODO +# - define policy on reading aligned sequences with gaps in +# (e.g. - and . characters) +# +# - Can we build the to_alignment(...) functionality +# into the generic Alignment class instead? +# +# - How best to handle unique/non unique record.id when writing. +# For most file formats reading such files is fine; The stockholm +# parser would fail. +# +# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf) +# http://www.bioperl.org/wiki/MSF_multiple_alignment_format +from Bio.Align import MultipleSeqAlignment +from Bio.File import as_handle + +from . import ClustalIO +from . import EmbossIO +from . import FastaIO +from . import MafIO +from . import MauveIO +from . import MsfIO +from . import NexusIO +from . import PhylipIO +from . import StockholmIO + +# Convention for format names is "mainname-subtype" in lower case. +# Please use the same names as BioPerl and EMBOSS where possible. 
+ +_FormatToIterator = { # "fasta" is done via Bio.SeqIO + "clustal": ClustalIO.ClustalIterator, + "emboss": EmbossIO.EmbossIterator, + "fasta-m10": FastaIO.FastaM10Iterator, + "maf": MafIO.MafIterator, + "mauve": MauveIO.MauveIterator, + "msf": MsfIO.MsfIterator, + "nexus": NexusIO.NexusIterator, + "phylip": PhylipIO.PhylipIterator, + "phylip-sequential": PhylipIO.SequentialPhylipIterator, + "phylip-relaxed": PhylipIO.RelaxedPhylipIterator, + "stockholm": StockholmIO.StockholmIterator, +} + +_FormatToWriter = { # "fasta" is done via Bio.SeqIO + "clustal": ClustalIO.ClustalWriter, + "maf": MafIO.MafWriter, + "mauve": MauveIO.MauveWriter, + "nexus": NexusIO.NexusWriter, + "phylip": PhylipIO.PhylipWriter, + "phylip-sequential": PhylipIO.SequentialPhylipWriter, + "phylip-relaxed": PhylipIO.RelaxedPhylipWriter, + "stockholm": StockholmIO.StockholmWriter, +} + + +def write(alignments, handle, format): + """Write complete set of alignments to a file. + + Arguments: + - alignments - A list (or iterator) of MultipleSeqAlignment objects, + or a single alignment object. + - handle - File handle object to write to, or filename as string + (note older versions of Biopython only took a handle). + - format - lower case string describing the file format to write. + + You should close the handle after calling this function. + + Returns the number of alignments written (as an integer). + """ + from Bio import SeqIO + + # Try and give helpful error messages: + if not isinstance(format, str): + raise TypeError("Need a string for the file format (lower case)") + if not format: + raise ValueError("Format required (lower case string)") + if format != format.lower(): + raise ValueError("Format string '%s' should be lower case" % format) + + if isinstance(alignments, MultipleSeqAlignment): + # This raised an exception in older versions of Biopython + alignments = [alignments] + + with as_handle(handle, "w") as fp: + # Map the file format to a writer class + if format in _FormatToWriter: + writer_class = _FormatToWriter[format] + count = writer_class(fp).write_file(alignments) + elif format in SeqIO._FormatToWriter: + # Exploit the existing SeqIO parser to do the dirty work! + # TODO - Can we make one call to SeqIO.write() and count the alignments? + count = 0 + for alignment in alignments: + if not isinstance(alignment, MultipleSeqAlignment): + raise TypeError( + "Expect a list or iterator of MultipleSeqAlignment " + "objects, got: %r" % alignment + ) + SeqIO.write(alignment, fp, format) + count += 1 + elif format in _FormatToIterator or format in SeqIO._FormatToIterator: + raise ValueError( + "Reading format '%s' is supported, but not writing" % format + ) + else: + raise ValueError("Unknown format '%s'" % format) + + if not isinstance(count, int): + raise RuntimeError( + "Internal error - the underlying %s " + "writer should have returned the alignment count, not %r" % (format, count) + ) + + return count + + +# This is a generator function! +def _SeqIO_to_alignment_iterator(handle, format, seq_count=None): + """Use Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE). + + Arguments: + - handle - handle to the file. + - format - string describing the file format. + - seq_count - Optional integer, number of sequences expected in each + alignment. Recommended for fasta format files. + + If count is omitted (default) then all the sequences in the file are + combined into a single MultipleSeqAlignment. 
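+
+    For example, with seq_count=3 a file of six sequences yields two
+    MultipleSeqAlignment objects of three records each; any leftover
+    partial batch raises a ValueError.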
+ """ + from Bio import SeqIO + + if format not in SeqIO._FormatToIterator: + raise ValueError("Unknown format '%s'" % format) + + if seq_count: + # Use the count to split the records into batches. + seq_record_iterator = SeqIO.parse(handle, format) + + records = [] + for record in seq_record_iterator: + records.append(record) + if len(records) == seq_count: + yield MultipleSeqAlignment(records) + records = [] + if records: + raise ValueError("Check seq_count argument, not enough sequences?") + else: + # Must assume that there is a single alignment using all + # the SeqRecord objects: + records = list(SeqIO.parse(handle, format)) + if records: + yield MultipleSeqAlignment(records) + + +def parse(handle, format, seq_count=None): + """Iterate over an alignment file as MultipleSeqAlignment objects. + + Arguments: + - handle - handle to the file, or the filename as a string + (note older versions of Biopython only took a handle). + - format - string describing the file format. + - seq_count - Optional integer, number of sequences expected in each + alignment. Recommended for fasta format files. + + If you have the file name in a string 'filename', use: + + >>> from Bio import AlignIO + >>> filename = "Emboss/needle.txt" + >>> format = "emboss" + >>> for alignment in AlignIO.parse(filename, format): + ... print("Alignment of length %i" % alignment.get_alignment_length()) + Alignment of length 124 + Alignment of length 119 + Alignment of length 120 + Alignment of length 118 + Alignment of length 125 + + If you have a string 'data' containing the file contents, use:: + + from Bio import AlignIO + from io import StringIO + my_iterator = AlignIO.parse(StringIO(data), format) + + Use the Bio.AlignIO.read() function when you expect a single record only. + """ + from Bio import SeqIO + + # Try and give helpful error messages: + if not isinstance(format, str): + raise TypeError("Need a string for the file format (lower case)") + if not format: + raise ValueError("Format required (lower case string)") + if format != format.lower(): + raise ValueError("Format string '%s' should be lower case" % format) + if seq_count is not None and not isinstance(seq_count, int): + raise TypeError("Need integer for seq_count (sequences per alignment)") + + with as_handle(handle) as fp: + # Map the file format to a sequence iterator: + if format in _FormatToIterator: + iterator_generator = _FormatToIterator[format] + i = iterator_generator(fp, seq_count) + + elif format in SeqIO._FormatToIterator: + # Exploit the existing SeqIO parser to the dirty work! + i = _SeqIO_to_alignment_iterator(fp, format, seq_count=seq_count) + else: + raise ValueError("Unknown format '%s'" % format) + + yield from i + + +def read(handle, format, seq_count=None): + """Turn an alignment file into a single MultipleSeqAlignment object. + + Arguments: + - handle - handle to the file, or the filename as a string + (note older versions of Biopython only took a handle). + - format - string describing the file format. + - seq_count - Optional integer, number of sequences expected in each + alignment. Recommended for fasta format files. + + If the handle contains no alignments, or more than one alignment, + an exception is raised. 
For example, using a PFAM/Stockholm file + containing one alignment: + + >>> from Bio import AlignIO + >>> filename = "Clustalw/protein.aln" + >>> format = "clustal" + >>> alignment = AlignIO.read(filename, format) + >>> print("Alignment of length %i" % alignment.get_alignment_length()) + Alignment of length 411 + + If however you want the first alignment from a file containing + multiple alignments this function would raise an exception. + + >>> from Bio import AlignIO + >>> filename = "Emboss/needle.txt" + >>> format = "emboss" + >>> alignment = AlignIO.read(filename, format) + Traceback (most recent call last): + ... + ValueError: More than one record found in handle + + Instead use: + + >>> from Bio import AlignIO + >>> filename = "Emboss/needle.txt" + >>> format = "emboss" + >>> alignment = next(AlignIO.parse(filename, format)) + >>> print("First alignment has length %i" % alignment.get_alignment_length()) + First alignment has length 124 + + You must use the Bio.AlignIO.parse() function if you want to read multiple + records from the handle. + """ + iterator = parse(handle, format, seq_count) + try: + alignment = next(iterator) + except StopIteration: + raise ValueError("No records found in handle") from None + try: + next(iterator) + raise ValueError("More than one record found in handle") + except StopIteration: + pass + if seq_count: + if len(alignment) != seq_count: + raise RuntimeError( + "More sequences found in alignment than specified in seq_count: %s." + % seq_count + ) + return alignment + + +def convert(in_file, in_format, out_file, out_format, molecule_type=None): + """Convert between two alignment files, returns number of alignments. + + Arguments: + - in_file - an input handle or filename + - in_format - input file format, lower case string + - output - an output handle or filename + - out_file - output file format, lower case string + - molecule_type - optional molecule type to apply, string containing + "DNA", "RNA" or "protein". + + **NOTE** - If you provide an output filename, it will be opened which will + overwrite any existing file without warning. This may happen if even the + conversion is aborted (e.g. an invalid out_format name is given). + + Some output formats require the molecule type be specified where this + cannot be determined by the parser. For example, converting to FASTA, + Clustal, or PHYLIP format to NEXUS: + + >>> from io import StringIO + >>> from Bio import AlignIO + >>> handle = StringIO() + >>> AlignIO.convert("Phylip/horses.phy", "phylip", handle, "nexus", "DNA") + 1 + >>> print(handle.getvalue()) + #NEXUS + begin data; + dimensions ntax=10 nchar=40; + format datatype=dna missing=? gap=-; + matrix + Mesohippus AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + Hypohippus AAACCCCCCCAAAAAAAAACAAAAAAAAAAAAAAAAAAAA + Archaeohip CAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAA + Parahippus CAAACAACAACAAAAAAAACAAAAAAAAAAAAAAAAAAAA + Merychippu CCAACCACCACCCCACACCCAAAAAAAAAAAAAAAAAAAA + 'M. 
secundu' CCAACCACCACCCACACCCCAAAAAAAAAAAAAAAAAAAA + Nannipus CCAACCACAACCCCACACCCAAAAAAAAAAAAAAAAAAAA + Neohippari CCAACCCCCCCCCCACACCCAAAAAAAAAAAAAAAAAAAA + Calippus CCAACCACAACCCACACCCCAAAAAAAAAAAAAAAAAAAA + Pliohippus CCCACCCCCCCCCACACCCCAAAAAAAAAAAAAAAAAAAA + ; + end; + + """ + if molecule_type: + if not isinstance(molecule_type, str): + raise TypeError("Molecule type should be a string, not %r" % molecule_type) + elif ( + "DNA" in molecule_type + or "RNA" in molecule_type + or "protein" in molecule_type + ): + pass + else: + raise ValueError("Unexpected molecule type, %r" % molecule_type) + + # TODO - Add optimised versions of important conversions + # For now just off load the work to SeqIO parse/write + # Don't open the output file until we've checked the input is OK: + alignments = parse(in_file, in_format, None) + + if molecule_type: + # Edit the records on the fly to set molecule type + + def over_ride(alignment): + """Over-ride molecule in-place.""" + for record in alignment: + record.annotations["molecule_type"] = molecule_type + return alignment + + alignments = (over_ride(_) for _ in alignments) + return write(alignments, out_file, out_format) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc new file mode 100644 index 0000000..967a616 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc new file mode 100644 index 0000000..dc69b07 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc new file mode 100644 index 0000000..590a863 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc new file mode 100644 index 0000000..50cee59 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc new file mode 100644 index 0000000..6495934 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc new file mode 100644 index 0000000..9a01d82 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc new file mode 100644 index 0000000..41d6c6a Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc new file mode 100644 index 0000000..7de4464 Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc differ diff --git a/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc new file mode 100644 index 0000000..15268e3 Binary 
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc
new file mode 100644
index 0000000..e638dbf
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..cf0b44e
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Alphabet/__init__.py b/code/lib/Bio/Alphabet/__init__.py
new file mode 100644
index 0000000..5109136
--- /dev/null
+++ b/code/lib/Bio/Alphabet/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2000-2002 by Andrew Dalke.
+# Revisions copyright 2007-2010 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Alphabets were previously used to declare sequence type and letters (OBSOLETE).
+
+The design of Bio.Alphabet included a number of historic design choices
+which, with the benefit of hindsight, were regrettable. Bio.Alphabet was
+therefore removed from Biopython in release 1.78. Instead, the molecule type is
+included as an annotation on SeqRecords where appropriate.
+
+Please see https://biopython.org/wiki/Alphabet for examples showing how to
+transition from Bio.Alphabet to molecule type annotations.
+"""
+
+raise ImportError(
+    "Bio.Alphabet has been removed from Biopython. In many cases, the alphabet can simply be ignored and removed from scripts. In a few cases, you may need to specify the ``molecule_type`` as an annotation on a SeqRecord for your script to work correctly. Please see https://biopython.org/wiki/Alphabet for more information."
+)
diff --git a/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..9ee4b01
Binary files /dev/null and b/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Application/__init__.py b/code/lib/Bio/Application/__init__.py
new file mode 100644
index 0000000..f844d27
--- /dev/null
+++ b/code/lib/Bio/Application/__init__.py
@@ -0,0 +1,838 @@
+# Copyright 2001-2004 Brad Chapman.
+# Revisions copyright 2009-2013 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""General mechanisms to access applications in Biopython (OBSOLETE).
+
+This module is not intended for direct use. It provides the basic objects which
+are subclassed by our command line wrappers, such as:
+
+ - Bio.Align.Applications
+ - Bio.Blast.Applications
+ - Bio.Emboss.Applications
+ - Bio.Sequencing.Applications
+
+These modules provide wrapper classes for command line tools to help you
+construct command line strings by setting the values of each parameter.
+The finished command line strings are then normally invoked via the built-in
+Python module subprocess.
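+
+For example, a finished command line string might be run along these lines,
+where ``cline`` stands for any wrapper instance (such as the WaterCommandline
+example shown further below)::
+
+    import subprocess
+    result = subprocess.run(str(cline), shell=True,
+                            capture_output=True, text=True)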
+
+Due to the ongoing maintenance burden of keeping command line application
+wrappers up to date, we have decided to deprecate and eventually remove them.
+We instead now recommend building your command line and invoking it directly
+with the subprocess module.
+"""
+import os
+import platform
+import sys
+import subprocess
+import re
+
+
+# Use this regular expression to test the property names are going to
+# be valid as Python properties or arguments
+_re_prop_name = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$")
+assert _re_prop_name.match("t")
+assert _re_prop_name.match("test")
+assert _re_prop_name.match("_test") is None  # we don't want private names
+assert _re_prop_name.match("-test") is None
+assert _re_prop_name.match("any-hyphen") is None
+assert _re_prop_name.match("underscore_ok")
+assert _re_prop_name.match("test_name")
+assert _re_prop_name.match("test2")
+# These are reserved names in Python itself,
+_reserved_names = [
+    "and",
+    "del",
+    "from",
+    "not",
+    "while",
+    "as",
+    "elif",
+    "global",
+    "or",
+    "with",
+    "assert",
+    "else",
+    "if",
+    "pass",
+    "yield",
+    "break",
+    "except",
+    "import",
+    "print",
+    "class",
+    "exec",
+    "in",
+    "raise",
+    "continue",
+    "finally",
+    "is",
+    "return",
+    "def",
+    "for",
+    "lambda",
+    "try",
+]
+# These are reserved names due to the way the wrappers work
+_local_reserved_names = ["set_parameter"]
+
+
+class ApplicationError(subprocess.CalledProcessError):
+    """Raised when an application returns a non-zero exit status (OBSOLETE).
+
+    The exit status will be stored in the returncode attribute, similarly
+    the command line string used in the cmd attribute, and (if captured)
+    stdout and stderr as strings.
+
+    This exception is a subclass of subprocess.CalledProcessError.
+
+    >>> err = ApplicationError(-11, "helloworld", "", "Some error text")
+    >>> err.returncode, err.cmd, err.stdout, err.stderr
+    (-11, 'helloworld', '', 'Some error text')
+    >>> print(err)
+    Non-zero return code -11 from 'helloworld', message 'Some error text'
+
+    """
+
+    def __init__(self, returncode, cmd, stdout="", stderr=""):
+        """Initialize the class."""
+        self.returncode = returncode
+        self.cmd = cmd
+        self.stdout = stdout
+        self.stderr = stderr
+
+    def __str__(self):
+        """Format the error as a string."""
+        # get first line of any stderr message
+        try:
+            msg = self.stderr.lstrip().split("\n", 1)[0].rstrip()
+        except Exception:  # TODO, ValueError? AttributeError?
+            msg = ""
+        if msg:
+            return "Non-zero return code %d from %r, message %r" % (
+                self.returncode,
+                self.cmd,
+                msg,
+            )
+        else:
+            return "Non-zero return code %d from %r" % (self.returncode, self.cmd)
+
+    def __repr__(self):
+        """Represent the error as a string."""
+        return "ApplicationError(%i, %s, %s, %s)" % (
+            self.returncode,
+            self.cmd,
+            self.stdout,
+            self.stderr,
+        )
+
+
+class AbstractCommandline:
+    r"""Generic interface for constructing command line strings (OBSOLETE).
+
+    This class shouldn't be called directly; it should be subclassed to
+    provide an implementation for a specific application.
+
+    For a usage example we'll show one of the EMBOSS wrappers. You can set
+    options when creating the wrapper object using keyword arguments - or
+    later using their corresponding properties:
+
+    >>> from Bio.Emboss.Applications import WaterCommandline
+    >>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
+    >>> cline
+    WaterCommandline(cmd='water', gapopen=10, gapextend=0.5)
+
+    You can instead manipulate the parameters via their properties, e.g.
+ + >>> cline.gapopen + 10 + >>> cline.gapopen = 20 + >>> cline + WaterCommandline(cmd='water', gapopen=20, gapextend=0.5) + + You can clear a parameter you have already added by 'deleting' the + corresponding property: + + >>> del cline.gapopen + >>> cline.gapopen + >>> cline + WaterCommandline(cmd='water', gapextend=0.5) + + Once you have set the parameters you need, you can turn the object into + a string (e.g. to log the command): + + >>> str(cline) + Traceback (most recent call last): + ... + ValueError: You must either set outfile (output filename), or enable filter or stdout (output to stdout). + + In this case the wrapper knows certain arguments are required to construct + a valid command line for the tool. For a complete example, + + >>> from Bio.Emboss.Applications import WaterCommandline + >>> water_cmd = WaterCommandline(gapopen=10, gapextend=0.5) + >>> water_cmd.asequence = "asis:ACCCGGGCGCGGT" + >>> water_cmd.bsequence = "asis:ACCCGAGCGCGGT" + >>> water_cmd.outfile = "temp_water.txt" + >>> print(water_cmd) + water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 + >>> water_cmd + WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5) + + You would typically run the command line via a standard Python operating + system call using the subprocess module for full control. For the simple + case where you just want to run the command and get the output: + + stdout, stderr = water_cmd() + + Note that by default we assume the underlying tool is installed on the + system $PATH environment variable. This is normal under Linux/Unix, but + may need to be done manually under Windows. Alternatively, you can specify + the full path to the binary as the first argument (cmd): + + >>> from Bio.Emboss.Applications import WaterCommandline + >>> water_cmd = WaterCommandline(r"C:\Program Files\EMBOSS\water.exe", + ... gapopen=10, gapextend=0.5, + ... asequence="asis:ACCCGGGCGCGGT", + ... bsequence="asis:ACCCGAGCGCGGT", + ... outfile="temp_water.txt") + >>> print(water_cmd) + "C:\Program Files\EMBOSS\water.exe" -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 + + Notice that since the path name includes a space it has automatically + been quoted. + + """ + + # TODO - Replace the above example since EMBOSS doesn't work properly + # if installed into a folder with a space like "C:\Program Files\EMBOSS" + # + # Note the call example above is not a doctest as we can't handle EMBOSS + # (or any other tool) being missing in the unit tests. + + parameters = None # will be a list defined in subclasses + + def __init__(self, cmd, **kwargs): + """Create a new instance of a command line wrapper object.""" + # Init method - should be subclassed! + # + # The subclass methods should look like this: + # + # def __init__(self, cmd="muscle", **kwargs): + # self.parameters = [...] + # AbstractCommandline.__init__(self, cmd, **kwargs) + # + # i.e. There should have an optional argument "cmd" to set the location + # of the executable (with a sensible default which should work if the + # command is on the path on Unix), and keyword arguments. It should + # then define a list of parameters, all objects derived from the base + # class _AbstractParameter. + # + # The keyword arguments should be any valid parameter name, and will + # be used to set the associated parameter. 
+ self.program_name = cmd + try: + parameters = self.parameters + except AttributeError: + raise AttributeError( + "Subclass should have defined self.parameters" + ) from None + # Create properties for each parameter at run time + aliases = set() + for p in parameters: + if not p.names: + if not isinstance(p, _StaticArgument): + raise TypeError("Expected %r to be of type _StaticArgument" % p) + continue + for name in p.names: + if name in aliases: + raise ValueError("Parameter alias %s multiply defined" % name) + aliases.add(name) + name = p.names[-1] + if _re_prop_name.match(name) is None: + raise ValueError( + "Final parameter name %r cannot be used as " + "an argument or property name in python" % name + ) + if name in _reserved_names: + raise ValueError( + "Final parameter name %r cannot be used as " + "an argument or property name because it is " + "a reserved word in python" % name + ) + if name in _local_reserved_names: + raise ValueError( + "Final parameter name %r cannot be used as " + "an argument or property name due to the " + "way the AbstractCommandline class works" % name + ) + + # Beware of binding-versus-assignment confusion issues + def getter(name): + return lambda x: x._get_parameter(name) + + def setter(name): + return lambda x, value: x.set_parameter(name, value) + + def deleter(name): + return lambda x: x._clear_parameter(name) + + doc = p.description + if isinstance(p, _Switch): + doc += ( + "\n\nThis property controls the addition of the %s " + "switch, treat this property as a boolean." % p.names[0] + ) + else: + doc += ( + "\n\nThis controls the addition of the %s parameter " + "and its associated value. Set this property to the " + "argument value required." % p.names[0] + ) + prop = property(getter(name), setter(name), deleter(name), doc) + setattr(self.__class__, name, prop) # magic! + for key, value in kwargs.items(): + self.set_parameter(key, value) + + def _validate(self): + """Make sure the required parameters have been set (PRIVATE). + + No return value - it either works or raises a ValueError. + + This is a separate method (called from __str__) so that subclasses may + override it. + """ + for p in self.parameters: + # Check for missing required parameters: + if p.is_required and not (p.is_set): + raise ValueError("Parameter %s is not set." % p.names[-1]) + # Also repeat the parameter validation here, just in case? + + def __str__(self): + """Make the commandline string with the currently set options. + + e.g. + + >>> from Bio.Emboss.Applications import WaterCommandline + >>> cline = WaterCommandline(gapopen=10, gapextend=0.5) + >>> cline.asequence = "asis:ACCCGGGCGCGGT" + >>> cline.bsequence = "asis:ACCCGAGCGCGGT" + >>> cline.outfile = "temp_water.txt" + >>> print(cline) + water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 + >>> str(cline) + 'water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5' + """ + self._validate() + commandline = "%s " % _escape_filename(self.program_name) + for parameter in self.parameters: + if parameter.is_set: + # This will include a trailing space: + commandline += str(parameter) + return commandline.strip() # remove trailing space + + def __repr__(self): + """Return a representation of the command line object for debugging. + + e.g. 
+ + >>> from Bio.Emboss.Applications import WaterCommandline + >>> cline = WaterCommandline(gapopen=10, gapextend=0.5) + >>> cline.asequence = "asis:ACCCGGGCGCGGT" + >>> cline.bsequence = "asis:ACCCGAGCGCGGT" + >>> cline.outfile = "temp_water.txt" + >>> print(cline) + water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5 + >>> cline + WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5) + """ + answer = "%s(cmd=%r" % (self.__class__.__name__, self.program_name) + for parameter in self.parameters: + if parameter.is_set: + if isinstance(parameter, _Switch): + answer += ", %s=True" % parameter.names[-1] + else: + answer += ", %s=%r" % (parameter.names[-1], parameter.value) + answer += ")" + return answer + + def _get_parameter(self, name): + """Get a commandline option value (PRIVATE).""" + for parameter in self.parameters: + if name in parameter.names: + if isinstance(parameter, _Switch): + return parameter.is_set + else: + return parameter.value + raise ValueError("Option name %s was not found." % name) + + def _clear_parameter(self, name): + """Reset or clear a commandline option value (PRIVATE).""" + cleared_option = False + for parameter in self.parameters: + if name in parameter.names: + parameter.value = None + parameter.is_set = False + cleared_option = True + if not cleared_option: + raise ValueError("Option name %s was not found." % name) + + def set_parameter(self, name, value=None): + """Set a commandline option for a program (OBSOLETE). + + Every parameter is available via a property and as a named + keyword when creating the instance. Using either of these is + preferred to this legacy set_parameter method which is now + OBSOLETE, and likely to be DEPRECATED and later REMOVED in + future releases. + """ + set_option = False + for parameter in self.parameters: + if name in parameter.names: + if isinstance(parameter, _Switch): + if value is None: + import warnings + + warnings.warn( + "For a switch type argument like %s, " + "we expect a boolean. None is treated " + "as FALSE!" % parameter.names[-1] + ) + parameter.is_set = bool(value) + set_option = True + else: + if value is not None: + self._check_value(value, name, parameter.checker_function) + parameter.value = value + parameter.is_set = True + set_option = True + if not set_option: + raise ValueError("Option name %s was not found." % name) + + def _check_value(self, value, name, check_function): + """Check whether the given value is valid (PRIVATE). + + No return value - it either works or raises a ValueError. + + This uses the passed function 'check_function', which can either + return a [0, 1] (bad, good) value or raise an error. Either way + this function will raise an error if the value is not valid, or + finish silently otherwise. + """ + if check_function is not None: + is_good = check_function(value) # May raise an exception + if is_good not in [0, 1, True, False]: + raise ValueError( + "Result of check_function: %r is of an unexpected value" % is_good + ) + if not is_good: + raise ValueError( + "Invalid parameter value %r for parameter %s" % (value, name) + ) + + def __setattr__(self, name, value): + """Set attribute name to value (PRIVATE). + + This code implements a workaround for a user interface issue. 
+        Without this __setattr__, attribute-based assignment of parameters
+        will silently accept invalid parameters, leading to known instances
+        of the user assuming that parameters for the application are set,
+        when they are not.
+
+        >>> from Bio.Emboss.Applications import WaterCommandline
+        >>> cline = WaterCommandline(gapopen=10, gapextend=0.5, stdout=True)
+        >>> cline.asequence = "a.fasta"
+        >>> cline.bsequence = "b.fasta"
+        >>> cline.csequence = "c.fasta"
+        Traceback (most recent call last):
+        ...
+        ValueError: Option name csequence was not found.
+        >>> print(cline)
+        water -stdout -asequence=a.fasta -bsequence=b.fasta -gapopen=10 -gapextend=0.5
+
+        This workaround uses a whitelist of object attributes, and sets the
+        object attribute list as normal, for these. Other attributes are
+        assumed to be parameters, and passed to the self.set_parameter method
+        for validation and assignment.
+        """
+        if name in ["parameters", "program_name"]:  # Allowed attributes
+            self.__dict__[name] = value
+        else:
+            self.set_parameter(name, value)  # treat as a parameter
+
+    def __call__(self, stdin=None, stdout=True, stderr=True, cwd=None, env=None):
+        """Execute command, wait for it to finish, return (stdout, stderr).
+
+        Runs the command line tool and waits for it to finish. If it returns
+        a non-zero error level, an exception is raised. Otherwise two strings
+        are returned containing stdout and stderr.
+
+        The optional stdin argument should be a string of data which will be
+        passed to the tool as standard input.
+
+        The optional stdout and stderr arguments may be filenames (string),
+        but otherwise are treated as booleans, and control whether the output
+        should be captured as strings (True, default), or ignored by sending
+        it to /dev/null to avoid wasting memory (False). If sent to a file
+        or ignored, then empty string(s) are returned.
+
+        The optional cwd argument is a string giving the working directory
+        to run the command from. See Python's subprocess module documentation
+        for more details.
+
+        The optional env argument is a dictionary setting the environment
+        variables to be used in the new process. By default the current
+        process' environment variables are used. See Python's subprocess
+        module documentation for more details.
+
+        Default example usage::
+
+            from Bio.Emboss.Applications import WaterCommandline
+            water_cmd = WaterCommandline(gapopen=10, gapextend=0.5,
+                                         stdout=True, auto=True,
+                                         asequence="a.fasta", bsequence="b.fasta")
+            print("About to run: %s" % water_cmd)
+            std_output, err_output = water_cmd()
+
+        This functionality is similar to subprocess.check_output(). In general
+        if you require more control over running the command, use subprocess
+        directly.
+
+        When the program called returns a non-zero error level, a custom
+        ApplicationError exception is raised. This includes any stdout and
+        stderr strings captured as attributes of the exception object, since
+        they may be useful for diagnosing what went wrong.
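+
+        As a small sketch of the file redirection described above (the input
+        files named here are assumed to exist)::
+
+            from Bio.Emboss.Applications import WaterCommandline
+            water_cmd = WaterCommandline(gapopen=10, gapextend=0.5, auto=True,
+                                         asequence="a.fasta", bsequence="b.fasta",
+                                         outfile="water.txt")
+            # stdout/stderr given as filenames are written to those files;
+            # empty strings are returned instead of the captured text:
+            out, err = water_cmd(stdout="tool.log", stderr="tool.err")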
+ """ + if not stdout: + stdout_arg = open(os.devnull, "w") + elif isinstance(stdout, str): + stdout_arg = open(stdout, "w") + else: + stdout_arg = subprocess.PIPE + + if not stderr: + stderr_arg = open(os.devnull, "w") + elif isinstance(stderr, str): + if stdout == stderr: + stderr_arg = stdout_arg # Write both to the same file + else: + stderr_arg = open(stderr, "w") + else: + stderr_arg = subprocess.PIPE + + # We may not need to supply any piped input, but we setup the + # standard input pipe anyway as a work around for a python + # bug if this is called from a Windows GUI program. For + # details, see http://bugs.python.org/issue1124861 + # + # Using universal newlines is important on Python 3, this + # gives unicode handles rather than bytes handles. + + # Windows 7, 8, 8.1 and 10 want shell = True + if sys.platform != "win32": + use_shell = True + else: + win_ver = platform.win32_ver()[0] + if win_ver in ["7", "8", "post2012Server", "10"]: + use_shell = True + else: + use_shell = False + child_process = subprocess.Popen( + str(self), + stdin=subprocess.PIPE, + stdout=stdout_arg, + stderr=stderr_arg, + universal_newlines=True, + cwd=cwd, + env=env, + shell=use_shell, + ) + # Use .communicate as can get deadlocks with .wait(), see Bug 2804 + stdout_str, stderr_str = child_process.communicate(stdin) + if not stdout: + assert not stdout_str, stdout_str + if not stderr: + assert not stderr_str, stderr_str + return_code = child_process.returncode + + # Particularly important to close handles on Jython and PyPy + # (where garbage collection is less predictable) and on Windows + # (where cannot delete files with an open handle): + if not stdout or isinstance(stdout, str): + # We opened /dev/null or a file + stdout_arg.close() + if not stderr or (isinstance(stderr, str) and stdout != stderr): + # We opened /dev/null or a file + stderr_arg.close() + + if return_code: + raise ApplicationError(return_code, str(self), stdout_str, stderr_str) + return stdout_str, stderr_str + + +class _AbstractParameter: + """A class to hold information about a parameter for a commandline. + + Do not use this directly, instead use one of the subclasses. + """ + + def __init__(self): + raise NotImplementedError + + def __str__(self): + raise NotImplementedError + + +class _Option(_AbstractParameter): + """Represent an option that can be set for a program. + + This holds UNIXish options like --append=yes and -a yes, + where a value (here "yes") is generally expected. + + For UNIXish options like -kimura in clustalw which don't + take a value, use the _Switch object instead. + + Attributes: + - names -- a list of string names (typically two entries) by which + the parameter can be set via the legacy set_parameter method + (eg ["-a", "--append", "append"]). The first name in list is used + when building the command line. The last name in the list is a + "human readable" name describing the option in one word. This + must be a valid Python identifier as it is used as the property + name and as a keyword argument, and should therefore follow PEP8 + naming. + - description -- a description of the option. This is used as + the property docstring. + - filename -- True if this argument is a filename (or other argument + that should be quoted) and should be automatically quoted if it + contains spaces. + - checker_function -- a reference to a function that will determine + if a given value is valid for this parameter. 
+       if a given value is valid for this parameter. This function can either
+       raise an error when given a bad value, or return a [0, 1] decision on
+       whether the value is correct.
+     - equate -- should an equals sign be inserted if a value is used?
+     - is_required -- a flag to indicate if the parameter must be set for
+       the program to be run.
+     - is_set -- if the parameter has been set
+     - value -- the value of a parameter
+
+    """
+
+    def __init__(
+        self,
+        names,
+        description,
+        filename=False,
+        checker_function=None,
+        is_required=False,
+        equate=True,
+    ):
+        self.names = names
+        if not isinstance(description, str):
+            raise TypeError("Should be a string: %r for %s" % (description, names[-1]))
+        # Note 'filename' is for any string with spaces that needs quoting
+        self.is_filename = filename
+        self.checker_function = checker_function
+        self.description = description
+        self.equate = equate
+        self.is_required = is_required
+
+        self.is_set = False
+        self.value = None
+
+    def __str__(self):
+        """Return the value of this option for the commandline.
+
+        Includes a trailing space.
+        """
+        # Note: Before equate was handled explicitly, the old
+        # code would do either "--name " or "--name=value ",
+        # or " -name " or " -name value ". This choice is now
+        # made explicitly when setting up the option.
+        if self.value is None:
+            return "%s " % self.names[0]
+        if self.is_filename:
+            v = _escape_filename(self.value)
+        else:
+            v = str(self.value)
+        if self.equate:
+            return "%s=%s " % (self.names[0], v)
+        else:
+            return "%s %s " % (self.names[0], v)
+
+
+class _Switch(_AbstractParameter):
+    """Represent an optional argument switch for a program.
+
+    This holds UNIXish options like -kimura in clustalw which don't
+    take a value, they are either included in the command string
+    or omitted.
+
+    Attributes:
+     - names -- a list of string names (typically two entries) by which
+       the parameter can be set via the legacy set_parameter method
+       (eg ["-a", "--append", "append"]). The first name in the list is
+       used when building the command line. The last name in the list is
+       a "human readable" name describing the option in one word. This
+       must be a valid Python identifier as it is used as the property
+       name and as a keyword argument, and should therefore follow PEP8
+       naming.
+     - description -- a description of the option. This is used as
+       the property docstring.
+     - is_set -- if the parameter has been set
+
+    NOTE - There is no value attribute, see is_set instead.
+
+    """
+
+    def __init__(self, names, description):
+        self.names = names
+        self.description = description
+        self.is_set = False
+        self.is_required = False
+
+    def __str__(self):
+        """Return the value of this option for the commandline.
+
+        Includes a trailing space.
+        """
+        assert not hasattr(self, "value")
+        if self.is_set:
+            return "%s " % self.names[0]
+        else:
+            return ""
+
+
+class _Argument(_AbstractParameter):
+    """Represent an argument on a commandline.
+
+    The names argument should be a list containing one string.
+    This must be a valid Python identifier as it is used as the
+    property name and as a keyword argument, and should therefore
+    follow PEP8 naming.
+ """ + + def __init__( + self, + names, + description, + filename=False, + checker_function=None, + is_required=False, + ): + # if len(names) != 1: + # raise ValueError("The names argument to _Argument should be a " + # "single entry list with a PEP8 property name.") + self.names = names + if not isinstance(description, str): + raise TypeError("Should be a string: %r for %s" % (description, names[-1])) + # Note 'filename' is for any string with spaces that needs quoting + self.is_filename = filename + self.checker_function = checker_function + self.description = description + self.is_required = is_required + self.is_set = False + self.value = None + + def __str__(self): + if self.value is None: + return " " + elif self.is_filename: + return "%s " % _escape_filename(self.value) + else: + return "%s " % self.value + + +class _ArgumentList(_Argument): + """Represent a variable list of arguments on a command line, e.g. multiple filenames.""" + + # TODO - Option to require at least one value? e.g. min/max count? + + def __str__(self): + if not isinstance(self.value, list): + raise TypeError("Arguments should be a list") + if not self.value: + raise ValueError("Requires at least one filename") + # A trailing space is required so that parameters following the last filename + # do not appear merged. + # e.g.: samtools cat in1.bam in2.bam-o out.sam [without trailing space][Incorrect] + # samtools cat in1.bam in2.bam -o out.sam [with trailing space][Correct] + if self.is_filename: + return " ".join(_escape_filename(v) for v in self.value) + " " + else: + return " ".join(self.value) + " " + + +class _StaticArgument(_AbstractParameter): + """Represent a static (read only) argument on a commandline. + + This is not intended to be exposed as a named argument or + property of a command line wrapper object. + """ + + def __init__(self, value): + self.names = [] + self.is_required = False + self.is_set = True + self.value = value + + def __str__(self): + return "%s " % self.value + + +def _escape_filename(filename): + """Escape filenames with spaces by adding quotes (PRIVATE). + + Note this will not add quotes if they are already included: + + >>> print((_escape_filename('example with spaces'))) + "example with spaces" + >>> print((_escape_filename('"example with spaces"'))) + "example with spaces" + >>> print((_escape_filename(1))) + 1 + + Note the function is more generic than the name suggests, since it + is used to add quotes around any string arguments containing spaces. + """ + # Is adding the following helpful + # if os.path.isfile(filename): + # # On Windows, if the file exists, we can ask for + # # its alternative short name (DOS style 8.3 format) + # # which has no spaces in it. Note that this name + # # is not portable between machines, or even folder! 
+            # try:
+            #     import win32api
+            #     short = win32api.GetShortPathName(filename)
+            #     assert os.path.isfile(short)
+            #     return short
+            # except ImportError:
+            #     pass
+    if not isinstance(filename, str):
+        # for example the NCBI BLAST+ -outfmt argument can be an integer
+        return filename
+    if " " not in filename:
+        return filename
+    # We'll just quote it - works on Windows, Mac OS X etc
+    if filename.startswith('"') and filename.endswith('"'):
+        # It's already quoted
+        return filename
+    else:
+        return '"%s"' % filename
+
+
+def _test():
+    """Run the Bio.Application module's doctests (PRIVATE)."""
+    import doctest
+
+    doctest.testmod(verbose=1)
+
+
+if __name__ == "__main__":
+    # Run the doctests
+    _test()
diff --git a/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..ee141eb
Binary files /dev/null and b/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/Applications.py b/code/lib/Bio/Blast/Applications.py
new file mode 100644
index 0000000..954a254
--- /dev/null
+++ b/code/lib/Bio/Blast/Applications.py
@@ -0,0 +1,1602 @@
+# Copyright 2001 Brad Chapman.
+# Revisions copyright 2009-2010 by Peter Cock.
+# Revisions copyright 2010 by Phillip Garland.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Definitions for interacting with BLAST related applications (OBSOLETE).
+
+Wrappers for the new NCBI BLAST+ tools (written in C++):
+
+ - NcbiblastpCommandline - Protein-Protein BLAST
+ - NcbiblastnCommandline - Nucleotide-Nucleotide BLAST
+ - NcbiblastxCommandline - Translated Query-Protein Subject BLAST
+ - NcbitblastnCommandline - Protein Query-Translated Subject BLAST
+ - NcbitblastxCommandline - Translated Query-Translated Subject BLAST
+ - NcbipsiblastCommandline - Position-Specific Initiated BLAST
+ - NcbirpsblastCommandline - Reverse Position Specific BLAST
+ - NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST
+ - NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast
+ - NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats
+ - NcbimakeblastdbCommandline - Application to create BLAST databases
+
+For further details, see:
+
+Camacho et al. BLAST+: architecture and applications
+BMC Bioinformatics 2009, 10:421
+https://doi.org/10.1186/1471-2105-10-421
+
+We have decided to remove this module in the future, and instead recommend
+building your command and invoking it via the subprocess module directly.
+"""
+
+from Bio.Application import _Option, AbstractCommandline, _Switch
+
+
+class _NcbibaseblastCommandline(AbstractCommandline):
+    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+    This is provided for subclassing, it deals with shared options
+    common to all the BLAST tools (blastn, rpsblast, rpstblastn, etc.
+    AND blast_formatter).
+    """
+
+    def __init__(self, cmd=None, **kwargs):
+        assert cmd is not None
+        extra_parameters = [
+            # Core:
+            _Switch(
+                ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
+            ),
+            _Switch(
+                ["-help", "help"],
+                "Print USAGE, DESCRIPTION and ARGUMENTS description; "
+                "ignore other arguments.",
+            ),
+            _Switch(
+                ["-version", "version"],
+                "Print version number; ignore other arguments.",
+            ),
+            # Output configuration options
+            _Option(
+                ["-out", "out"],
+                "Output file for alignment.",
+                filename=True,
+                equate=False,
+            ),
+            # Formatting options:
+            _Option(
+                ["-outfmt", "outfmt"],
+                "Alignment view. Typically an integer 0-14 but for some "
+                "formats can be named columns like '6 qseqid sseqid'. "
+                "Use 5 for XML output (differs from classic BLAST which "
+                "used 7 for XML).",
+                filename=True,  # to ensure spaced inputs are quoted
+                equate=False,
+            ),
+            # TODO - Document and test the column options
+            _Switch(["-show_gis", "show_gis"], "Show NCBI GIs in deflines?"),
+            _Option(
+                ["-num_descriptions", "num_descriptions"],
+                "Number of database sequences to show one-line descriptions for.\n\n"
+                "Integer argument (at least zero). Default is 500. "
+                "See also num_alignments.",
+                equate=False,
+            ),
+            _Option(
+                ["-num_alignments", "num_alignments"],
+                "Number of database sequences to show alignments for.\n\n"
+                "Integer argument (at least zero). Default is 250. "
+                "See also num_descriptions.",
+                equate=False,
+            ),
+            _Option(
+                ["-line_length", "line_length"],
+                "Line length for formatting alignments "
+                "(integer, at least 1, default 60).\n\n"
+                "Not applicable for outfmt > 4. Added in BLAST+ 2.2.30.",
+                equate=False,
+            ),
+            _Switch(
+                ["-html", "html"], "Produce HTML output? See also the outfmt option."
+            ),
+            # Miscellaneous options
+            _Switch(
+                ["-parse_deflines", "parse_deflines"],
+                "Should the query and subject defline(s) be parsed?",
+            ),
+        ]
+        try:
+            # Insert extra parameters - at the start just in case there
+            # are any arguments which must come last:
+            self.parameters = extra_parameters + self.parameters
+        except AttributeError:
+            # Should we raise an error? The subclass should have set this up!
+            self.parameters = extra_parameters
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+    def _validate_incompatibilities(self, incompatibles):
+        """Validate parameters for incompatibilities (PRIVATE).
+
+        Used by the _validate method.
+        """
+        for a in incompatibles:
+            if self._get_parameter(a):
+                for b in incompatibles[a]:
+                    if self._get_parameter(b):
+                        raise ValueError("Options %s and %s are incompatible." % (a, b))
+
+
+class _NcbiblastCommandline(_NcbibaseblastCommandline):
+    """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+    This is provided for subclassing, it deals with shared options
+    common to all the BLAST tools (blastn, rpsblast, rpstblastn, etc).
+    """
+
+    def __init__(self, cmd=None, **kwargs):
+        assert cmd is not None
+        extra_parameters = [
+            # Input query options:
+            _Option(
+                ["-query", "query"],
+                "The sequence to search with.",
+                filename=True,
+                equate=False,
+            ),  # Should this be required?
+            _Option(
+                ["-query_loc", "query_loc"],
+                "Location on the query sequence (Format: start-stop).",
+                equate=False,
+            ),
+            # General search options:
+            _Option(["-db", "db"], "The database to BLAST against.", equate=False),
+            _Option(["-evalue", "evalue"], "Expectation value cutoff.", equate=False),
+            _Option(
+                ["-word_size", "word_size"],
+                "Word size for wordfinder algorithm.\n\nInteger. Minimum 2.",
+                equate=False,
+            ),
+            # BLAST-2-Sequences options:
+            # - see subclass
+            # Formatting options:
+            # - see baseclass
+            # Query filtering options
+            _Option(
+                ["-soft_masking", "soft_masking"],
+                "Apply filtering locations as soft masks (Boolean, Default = true).",
+                equate=False,
+            ),
+            _Switch(
+                ["-lcase_masking", "lcase_masking"],
+                "Use lower case filtering in query and subject sequence(s)?",
+            ),
+            # Restrict search or results
+            _Option(
+                ["-gilist", "gilist"],
+                "Restrict search of database to list of GI's.\n\n"
+                "Incompatible with: negative_gilist, seqidlist, negative_seqidlist, "
+                "remote, subject, subject_loc",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-negative_gilist", "negative_gilist"],
+                "Restrict search of database to everything except the listed GIs.\n\n"
+                "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-seqidlist", "seqidlist"],
+                "Restrict search of database to list of SeqID's.\n\n"
+                "Incompatible with: gilist, negative_gilist, remote, subject, "
+                "subject_loc",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-negative_seqidlist", "negative_seqidlist"],
+                "Restrict search of database to everything except listed SeqID's.\n\n"
+                "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-entrez_query", "entrez_query"],
+                "Restrict search with the given Entrez query (requires remote).",
+                equate=False,
+            ),
+            _Option(
+                ["-qcov_hsp_perc", "qcov_hsp_perc"],
+                "Percent query coverage per hsp (float, 0 to 100).\n\n"
+                "Added in BLAST+ 2.2.30.",
+                equate=False,
+            ),
+            _Option(
+                ["-max_target_seqs", "max_target_seqs"],
+                "Maximum number of aligned sequences to keep (integer, at least one).",
+                equate=False,
+            ),
+            # Statistical options
+            _Option(
+                ["-dbsize", "dbsize"],
+                "Effective length of the database (integer).",
+                equate=False,
+            ),
+            _Option(
+                ["-searchsp", "searchsp"],
+                "Effective length of the search space (integer).",
+                equate=False,
+            ),
+            _Option(
+                ["-max_hsps_per_subject", "max_hsps_per_subject"],
+                "Override max number of HSPs per subject saved for ungapped searches "
+                "(integer).",
+                equate=False,
+            ),
+            _Option(
+                ["-max_hsps", "max_hsps"],
+                "Set max number of HSPs saved per subject sequence.\n\n"
+                "Default 0 means no limit.",
+                equate=False,
+            ),
+            _Switch(["-sum_statistics", "sum_statistics"], "Use sum statistics."),
+            # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch?
+ _Option( + ["-sum_stats", "sum_stats"], + "Use sum statistics (boolean).\n\nAdded in BLAST+ 2.2.30.", + equate=False, + ), + # Extension options + _Option( + ["-xdrop_ungap", "xdrop_ungap"], + "X-dropoff value (in bits) for ungapped extensions (float).", + equate=False, + ), + _Option( + ["-xdrop_gap", "xdrop_gap"], + "X-dropoff value (in bits) for preliminary gapped extensions (float).", + equate=False, + ), + _Option( + ["-xdrop_gap_final", "xdrop_gap_final"], + "X-dropoff value (in bits) for final gapped alignment (float).", + equate=False, + ), + _Option( + ["-window_size", "window_size"], + "Multiple hits window size, use 0 to specify 1-hit algorithm " + "(integer).", + equate=False, + ), + # Search strategy options + _Option( + ["-import_search_strategy", "import_search_strategy"], + "Search strategy to use.\n\n" + "Incompatible with: export_search_strategy", + filename=True, + equate=False, + ), + _Option( + ["-export_search_strategy", "export_search_strategy"], + "File name to record the search strategy used.\n\n" + "Incompatible with: import_search_strategy", + filename=True, + equate=False, + ), + # Miscellaneous options + _Option( + ["-num_threads", "num_threads"], + "Number of threads to use in the BLAST search.\n\n" + "Integer, at least one. Default is one. Incompatible with: remote", + equate=False, + ), + _Switch( + ["-remote", "remote"], + "Execute search remotely?\n\n" + "Incompatible with: gilist, negative_gilist, subject_loc, " + "num_threads, ...", + ), + ] + try: + # Insert extra parameters - at the start just in case there + # are any arguments which must come last: + self.parameters = extra_parameters + self.parameters + except AttributeError: + # Should we raise an error? The subclass should have set this up! + self.parameters = extra_parameters + _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) + + def _validate(self): + incompatibles = { + "remote": ["gilist", "negative_gilist", "num_threads"], + "import_search_strategy": ["export_search_strategy"], + "gilist": ["negative_gilist"], + "seqidlist": ["gilist", "negative_gilist", "remote"], + } + self._validate_incompatibilities(incompatibles) + if self.entrez_query and not self.remote: + raise ValueError("Option entrez_query requires remote option.") + AbstractCommandline._validate(self) + + +class _Ncbiblast2SeqCommandline(_NcbiblastCommandline): + """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). + + This is provided for subclassing, it deals with shared options + common to all the BLAST tools supporting two-sequence BLAST + (blastn, psiblast, etc) but not rpsblast or rpstblastn. 
+ """ + + def __init__(self, cmd=None, **kwargs): + assert cmd is not None + extra_parameters = [ + # General search options: + _Option( + ["-gapopen", "gapopen"], "Cost to open a gap (integer).", equate=False + ), + _Option( + ["-gapextend", "gapextend"], + "Cost to extend a gap (integer).", + equate=False, + ), + # BLAST-2-Sequences options: + _Option( + ["-subject", "subject"], + "Subject sequence(s) to search.\n\n" + "Incompatible with: db, gilist, seqidlist, negative_gilist, " + "negative_seqidlist, db_soft_mask, db_hard_mask\n\n" + "See also subject_loc.", + filename=True, + equate=False, + ), + _Option( + ["-subject_loc", "subject_loc"], + "Location on the subject sequence (Format: start-stop).\n\n" + "Incompatible with: db, gilist, seqidlist, negative_gilist, " + "negative_seqidlist, db_soft_mask, db_hard_mask, remote.\n\n" + "See also subject.", + equate=False, + ), + # Restrict search or results: + _Option( + ["-culling_limit", "culling_limit"], + "Hit culling limit (integer).\n\n" + "If the query range of a hit is enveloped by that of at " + "least this many higher-scoring hits, delete the hit.\n\n" + "Incompatible with: best_hit_overhang, best_hit_score_edge.", + equate=False, + ), + _Option( + ["-best_hit_overhang", "best_hit_overhang"], + "Best Hit algorithm overhang value (float, recommended value: 0.1)\n\n" + "Float between 0.0 and 0.5 inclusive. " + "Incompatible with: culling_limit.", + equate=False, + ), + _Option( + ["-best_hit_score_edge", "best_hit_score_edge"], + "Best Hit algorithm score edge value (float).\n\n" + "Float between 0.0 and 0.5 inclusive. Recommended value: 0.1\n\n" + "Incompatible with: culling_limit.", + equate=False, + ), + ] + try: + # Insert extra parameters - at the start just in case there + # are any arguments which must come last: + self.parameters = extra_parameters + self.parameters + except AttributeError: + # Should we raise an error? The subclass should have set this up! + self.parameters = extra_parameters + _NcbiblastCommandline.__init__(self, cmd, **kwargs) + + def _validate(self): + incompatibles = { + "subject_loc": ["db", "gilist", "negative_gilist", "seqidlist", "remote"], + "culling_limit": ["best_hit_overhang", "best_hit_score_edge"], + "subject": ["db", "gilist", "negative_gilist", "seqidlist"], + } + self._validate_incompatibilities(incompatibles) + _NcbiblastCommandline._validate(self) + + +class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline): + """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). + + This is provided for subclassing, it deals with shared options + common to the main BLAST tools blastp, blastn, blastx, tblastx, tblastn + but not psiblast, rpsblast or rpstblastn. + """ + + def __init__(self, cmd=None, **kwargs): + assert cmd is not None + extra_parameters = [ + # Restrict search or results: + _Option( + ["-db_soft_mask", "db_soft_mask"], + "Filtering algorithm for soft masking (integer).\n\n" + "Filtering algorithm ID to apply to BLAST database as soft masking. " + "Incompatible with: db_hard_mask, subject, subject_loc", + equate=False, + ), + _Option( + ["-db_hard_mask", "db_hard_mask"], + "Filtering algorithm for hard masking (integer).\n\n" + "Filtering algorithm ID to apply to BLAST database as hard masking. 
" + "Incompatible with: db_soft_mask, subject, subject_loc", + equate=False, + ), + ] + try: + # Insert extra parameters - at the start just in case there + # are any arguments which must come last: + self.parameters = extra_parameters + self.parameters + except AttributeError: + # Should we raise an error? The subclass should have set this up! + self.parameters = extra_parameters + _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs) + + def _validate(self): + incompatibles = { + "db_soft_mask": ["db_hard_mask", "subject", "subject_loc"], + "db_hard_mask": ["db_soft_mask", "subject", "subject_loc"], + } + self._validate_incompatibilities(incompatibles) + _Ncbiblast2SeqCommandline._validate(self) + + +class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline): + """Create a commandline for the NCBI BLAST+ program blastp (for proteins). + + With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI + replaced the old blastall tool with separate tools for each of the searches. + This wrapper therefore replaces BlastallCommandline with option -p blastp. + + >>> from Bio.Blast.Applications import NcbiblastpCommandline + >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr", + ... evalue=0.001, remote=True, ungapped=True) + >>> cline + NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True) + >>> print(cline) + blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped + + You would typically run the command line with cline() or via the Python + subprocess module, as described in the Biopython tutorial. + """ + + def __init__(self, cmd="blastp", **kwargs): + """Initialize the class.""" + self.parameters = [ + # General search options: + _Option( + ["-task", "task"], + "Task to execute (string, blastp (default), blastp-fast or blastp-short).", + checker_function=lambda value: value + in ["blastp", "blastp-fast", "blastp-short"], + equate=False, + ), + _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), + _Option( + ["-threshold", "threshold"], + "Minimum score for words to be added to the BLAST lookup table (float).", + equate=False, + ), + _Option( + ["-comp_based_stats", "comp_based_stats"], + "Use composition-based statistics (string, default 2, i.e. True).\n\n" + "0, F or f: no composition-based statistics\n\n" + "2, T or t, D or d : Composition-based score adjustment as in " + "Bioinformatics 21:902-911, 2005, conditioned on sequence " + "properties\n\n" + "Note that tblastn also supports values of 1 and 3.", + checker_function=lambda value: value in "0Ft2TtDd", + equate=False, + ), + # Query filtering options: + _Option( + ["-seg", "seg"], + "Filter query sequence with SEG (string).\n\n" + 'Format: "yes", "window locut hicut", or "no" to disable\n' + 'Default is "12 2.2 2.5"', + equate=False, + ), + # Extension options: + _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"), + # Miscellaneous options: + _Switch( + ["-use_sw_tback", "use_sw_tback"], + "Compute locally optimal Smith-Waterman alignments?", + ), + ] + _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs) + + +class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline): + """Wrapper for the NCBI BLAST+ program blastn (for nucleotides). + + With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI + replaced the old blastall tool with separate tools for each of the searches. + This wrapper therefore replaces BlastallCommandline with option -p blastn. 
+
+    For example, to run a search against the "nt" nucleotide database using the
+    FASTA nucleotide file "m_cold.fasta" as the query, with an expectation value
+    cut off of 0.001, saving the output to a file in XML format:
+
+    >>> from Bio.Blast.Applications import NcbiblastnCommandline
+    >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus",
+    ...                               evalue=0.001, out="m_cold.xml", outfmt=5)
+    >>> cline
+    NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus')
+    >>> print(cline)
+    blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="blastn", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # Input query options:
+            _Option(
+                ["-strand", "strand"],
+                "Query strand(s) to search against database/subject.\n\n"
+                'Values allowed are "both" (default), "minus", "plus".',
+                checker_function=lambda value: value in ["both", "minus", "plus"],
+                equate=False,
+            ),
+            # General search options:
+            _Option(
+                ["-task", "task"],
+                "Task to execute (string, default 'megablast')\n\n"
+                "Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' "
+                "(the default), or 'vecscreen'.",
+                checker_function=lambda value: value
+                in ["blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen"],
+                equate=False,
+            ),
+            _Option(
+                ["-penalty", "penalty"],
+                "Penalty for a nucleotide mismatch (integer, at most zero).",
+                equate=False,
+            ),
+            _Option(
+                ["-reward", "reward"],
+                "Reward for a nucleotide match (integer, at least zero).",
+                equate=False,
+            ),
+            _Option(
+                ["-use_index", "use_index"],
+                "Use MegaBLAST database index (Boolean, Default = False)",
+                equate=False,
+            ),
+            _Option(
+                ["-index_name", "index_name"],
+                "MegaBLAST database index name.",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-dust", "dust"],
+                "Filter query sequence with DUST (string).\n\n"
+                "Format: 'yes', 'level window linker', or 'no' to disable.\n\n"
+                "Default = '20 64 1'.",
+                equate=False,
+            ),
+            _Option(
+                ["-filtering_db", "filtering_db"],
+                "BLAST database containing filtering elements (i.e. repeats).",
+                equate=False,
+            ),
+            _Option(
+                ["-window_masker_taxid", "window_masker_taxid"],
+                "Enable WindowMasker filtering using a Taxonomic ID (integer).",
+                equate=False,
+            ),
+            _Option(
+                ["-window_masker_db", "window_masker_db"],
+                "Enable WindowMasker filtering using this repeats database (string).",
+                equate=False,
+            ),
+            # Restrict search or results:
+            _Option(
+                ["-perc_identity", "perc_identity"],
+                "Percent identity (real, 0 to 100 inclusive).",
+                equate=False,
+            ),
+            # Discontiguous MegaBLAST options
+            _Option(
+                ["-template_type", "template_type"],
+                "Discontiguous MegaBLAST template type (string).\n\n"
+                "Allowed values: 'coding', 'coding_and_optimal' or 'optimal'.\n\n"
+                "Requires: template_length.",
+                checker_function=lambda value: value
+                in ["coding", "coding_and_optimal", "optimal"],
+                equate=False,
+            ),
+            _Option(
+                ["-template_length", "template_length"],
+                "Discontiguous MegaBLAST template length (integer).\n\n"
+                "Allowed values: 16, 18, 21.\n\n"
+                "Requires: template_type.",
+                checker_function=lambda value: value in [16, 18, 21, "16", "18", "21"],
+                equate=False,
+            ),
+            # Extension options:
+            _Switch(
+                ["-no_greedy", "no_greedy"],
+                "Use non-greedy dynamic programming extension",
+            ),
+            _Option(
+                ["-min_raw_gapped_score", "min_raw_gapped_score"],
+                "Minimum raw gapped score to keep an alignment in the "
+                "preliminary gapped and traceback stages (integer).",
+                equate=False,
+            ),
+            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+            _Option(
+                ["-off_diagonal_range", "off_diagonal_range"],
+                "Number of off-diagonals to search for the 2nd hit (integer).\n\n"
+                "Expects a positive integer, or 0 (default) to turn off.\n\n"
+                "Added in BLAST 2.2.23+.",
+                equate=False,
+            ),
+        ]
+        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+    def _validate(self):
+        if (self.template_type and not self.template_length) or (
+            self.template_length and not self.template_type
+        ):
+            raise ValueError(
+                "Options template_type and template_length require each other."
+            )
+        _NcbiblastMain2SeqCommandline._validate(self)
+
+
+class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline):
+    """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database).
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old blastall tool with separate tools for each of the searches.
+    This wrapper therefore replaces BlastallCommandline with option -p blastx.
+
+    >>> from Bio.Blast.Applications import NcbiblastxCommandline
+    >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001)
+    >>> cline
+    NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001)
+    >>> print(cline)
+    blastx -query m_cold.fasta -db nr -evalue 0.001
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
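+
+    Since these wrappers are obsolete, the equivalent search can also be run
+    by building the argument list yourself and calling Python's subprocess
+    module directly (a minimal sketch, reusing the query and database names
+    from the example above and assuming blastx is on the PATH)::
+
+        import subprocess
+        result = subprocess.run(
+            ["blastx", "-query", "m_cold.fasta", "-db", "nr", "-evalue", "0.001"],
+            capture_output=True, text=True, check=True,
+        )
+        print(result.stdout)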
+    """
+
+    def __init__(self, cmd="blastx", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # Input query options:
+            _Option(
+                ["-task", "task"],
+                "Task to execute (string, blastx (default) or blastx-fast).",
+                checker_function=lambda value: value in ["blastx", "blastx-fast"],
+                equate=False,
+            ),
+            _Option(
+                ["-strand", "strand"],
+                "Query strand(s) to search against database/subject.\n\n"
+                'Values allowed are "both" (default), "minus", "plus".',
+                checker_function=lambda value: value in ["both", "minus", "plus"],
+                equate=False,
+            ),
+            # Input query options:
+            _Option(
+                ["-query_gencode", "query_gencode"],
+                "Genetic code to use to translate query (integer, default 1).",
+                equate=False,
+            ),
+            # General search options:
+            _Option(
+                ["-frame_shift_penalty", "frame_shift_penalty"],
+                "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
+                "This was removed in BLAST 2.2.27+",
+                equate=False,
+            ),
+            _Option(
+                ["-max_intron_length", "max_intron_length"],
+                "Maximum intron length (integer).\n\n"
+                "Length of the largest intron allowed in a translated nucleotide "
+                "sequence when linking multiple distinct alignments (a negative "
+                "value disables linking). Default zero.",
+                equate=False,
+            ),
+            _Option(
+                ["-matrix", "matrix"],
+                "Scoring matrix name (default BLOSUM62).",
+                equate=False,
+            ),
+            _Option(
+                ["-threshold", "threshold"],
+                "Minimum score for words to be added to the BLAST lookup table (float).",
+                equate=False,
+            ),
+            _Option(
+                ["-comp_based_stats", "comp_based_stats"],
+                "Use composition-based statistics for blastp, blastx, or tblastn.\n\n"
+                "D or d: default (equivalent to 2)\n\n"
+                "0 or F or f: no composition-based statistics\n\n"
+                "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
+                "2 or T or t: Composition-based score adjustment as in "
+                "Bioinformatics 21:902-911, 2005, conditioned on sequence "
+                "properties\n\n"
+                "3: Composition-based score adjustment as in Bioinformatics "
+                "21:902-911, 2005, unconditionally.\n\n"
+                "For programs other than tblastn, must either be absent or be "
+                "D, F or 0.\n\n"
+                "Default = 2.",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+            # Extension options:
+            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+            _Switch(
+                ["-use_sw_tback", "use_sw_tback"],
+                "Compute locally optimal Smith-Waterman alignments?",
+            ),
+        ]
+        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline):
+    """Wrapper for the NCBI BLAST+ program tblastn.
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old blastall tool with separate tools for each of the searches.
+    This wrapper therefore replaces BlastallCommandline with option -p tblastn.
+
+    >>> from Bio.Blast.Applications import NcbitblastnCommandline
+    >>> cline = NcbitblastnCommandline(help=True)
+    >>> cline
+    NcbitblastnCommandline(cmd='tblastn', help=True)
+    >>> print(cline)
+    tblastn -help
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
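+
+    If the tool exits with a non-zero return code, calling the wrapper raises
+    an ApplicationError carrying the captured output (a minimal sketch; the
+    file names are hypothetical and tblastn is assumed to be installed)::
+
+        from Bio.Application import ApplicationError
+        from Bio.Blast.Applications import NcbitblastnCommandline
+        cline = NcbitblastnCommandline(query="proteins.fasta", db="nt",
+                                       evalue=0.001, outfmt=5, out="hits.xml")
+        try:
+            stdout, stderr = cline()
+        except ApplicationError as err:
+            print(err.returncode, err.stderr)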
+    """
+
+    def __init__(self, cmd="tblastn", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # General search options:
+            _Option(
+                ["-task", "task"],
+                "Task to execute (string, tblastn (default) or tblastn-fast).",
+                checker_function=lambda value: value in ["tblastn", "tblastn-fast"],
+                equate=False,
+            ),
+            _Option(
+                ["-db_gencode", "db_gencode"],
+                "Genetic code to use to translate database/subjects (integer, default 1).",
+                equate=False,
+            ),
+            _Option(
+                ["-frame_shift_penalty", "frame_shift_penalty"],
+                "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
+                "This was removed in BLAST 2.2.27+",
+                equate=False,
+            ),
+            _Option(
+                ["-max_intron_length", "max_intron_length"],
+                "Maximum intron length (integer).\n\n"
+                "Length of the largest intron allowed in a translated nucleotide "
+                "sequence when linking multiple distinct alignments (a negative "
+                "value disables linking). Default zero.",
+                equate=False,
+            ),
+            _Option(
+                ["-matrix", "matrix"],
+                "Scoring matrix name (default BLOSUM62).",
+                equate=False,
+            ),
+            _Option(
+                ["-threshold", "threshold"],
+                "Minimum score for words to be added to the BLAST lookup table (float).",
+                equate=False,
+            ),
+            _Option(
+                ["-comp_based_stats", "comp_based_stats"],
+                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+                "0, F or f: no composition-based statistics\n\n"
+                "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
+                "2, T or t, D or d: Composition-based score adjustment as in "
+                "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
+                "3: Composition-based score adjustment as in Bioinformatics 21:902-911, "
+                "2005, unconditionally\n\n"
+                "Note that only tblastn supports values of 1 and 3.",
+                checker_function=lambda value: value in "0Ft12TtDd3",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+            # Extension options:
+            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+            # Miscellaneous options:
+            _Switch(
+                ["-use_sw_tback", "use_sw_tback"],
+                "Compute locally optimal Smith-Waterman alignments?",
+            ),
+            # PSI-TBLASTN options:
+            _Option(
+                ["-in_pssm", "in_pssm"],
+                "PSI-BLAST checkpoint file.\n\nIncompatible with: remote, query",
+                filename=True,
+                equate=False,
+            ),
+        ]
+        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline):
+    """Wrapper for the NCBI BLAST+ program tblastx.
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old blastall tool with separate tools for each of the searches.
+    This wrapper therefore replaces BlastallCommandline with option -p tblastx.
+
+    >>> from Bio.Blast.Applications import NcbitblastxCommandline
+    >>> cline = NcbitblastxCommandline(help=True)
+    >>> cline
+    NcbitblastxCommandline(cmd='tblastx', help=True)
+    >>> print(cline)
+    tblastx -help
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="tblastx", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # Input query options:
+            _Option(
+                ["-strand", "strand"],
+                "Query strand(s) to search against database/subject.\n\n"
+                'Values allowed are "both" (default), "minus", "plus".',
+                checker_function=lambda value: value in ["both", "minus", "plus"],
+                equate=False,
+            ),
+            # Input query options:
+            _Option(
+                ["-query_gencode", "query_gencode"],
+                "Genetic code to use to translate query (integer, default 1).",
+                equate=False,
+            ),
+            # General search options:
+            _Option(
+                ["-db_gencode", "db_gencode"],
+                "Genetic code to use to translate database/subjects (integer, default 1).",
+                equate=False,
+            ),
+            _Option(
+                ["-max_intron_length", "max_intron_length"],
+                "Maximum intron length (integer).\n\n"
+                "Length of the largest intron allowed in a translated nucleotide "
+                "sequence when linking multiple distinct alignments (a negative "
+                "value disables linking). Default zero.",
+                equate=False,
+            ),
+            _Option(
+                ["-matrix", "matrix"],
+                "Scoring matrix name (default BLOSUM62).",
+                equate=False,
+            ),
+            _Option(
+                ["-threshold", "threshold"],
+                "Minimum score for words to be added to the BLAST lookup table (float).",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+        ]
+        _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline):
+    """Wrapper for the NCBI BLAST+ program psiblast.
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old blastpgp tool with a similar tool psiblast. This wrapper
+    therefore replaces BlastpgpCommandline, the wrapper for blastpgp.
+
+    >>> from Bio.Blast.Applications import NcbipsiblastCommandline
+    >>> cline = NcbipsiblastCommandline(help=True)
+    >>> cline
+    NcbipsiblastCommandline(cmd='psiblast', help=True)
+    >>> print(cline)
+    psiblast -help
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="psiblast", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # General search options:
+            _Option(
+                ["-matrix", "matrix"],
+                "Scoring matrix name (default BLOSUM62).",
+                equate=False,
+            ),
+            _Option(
+                ["-threshold", "threshold"],
+                "Minimum score for words to be added to the BLAST lookup table (float).",
+                equate=False,
+            ),
+            _Option(
+                ["-comp_based_stats", "comp_based_stats"],
+                "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+                "0, F or f: no composition-based statistics\n\n"
+                "2, T or t, D or d: Composition-based score adjustment as in "
+                "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
+                "Note that tblastn also supports values of 1 and 3.",
+                checker_function=lambda value: value in "0Ft2TtDd",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+            # Extension options:
+            _Option(
+                ["-gap_trigger", "gap_trigger"],
+                "Number of bits to trigger gapping (float, default 22).",
+                equate=False,
+            ),
+            # Miscellaneous options:
+            _Switch(
+                ["-use_sw_tback", "use_sw_tback"],
+                "Compute locally optimal Smith-Waterman alignments?",
+            ),
+            # PSI-BLAST options:
+            _Option(
+                ["-num_iterations", "num_iterations"],
+                "Number of iterations to perform (integer, at least one).\n\n"
+                "Default is one. Incompatible with: remote",
+                equate=False,
+            ),
+            _Option(
+                ["-out_pssm", "out_pssm"],
+                "File name to store checkpoint file.",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-out_ascii_pssm", "out_ascii_pssm"],
+                "File name to store ASCII version of PSSM.",
+                filename=True,
+                equate=False,
+            ),
+            _Switch(
+                ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
+                "Save PSSM after the last database search.",
+            ),
+            _Switch(
+                ["-save_each_pssm", "save_each_pssm"],
+                "Save PSSM after each iteration.\n\n"
+                "File name is given in -out_pssm or -out_ascii_pssm options.",
+            ),
+            _Option(
+                ["-in_msa", "in_msa"],
+                "File name of multiple sequence alignment to restart PSI-BLAST.\n\n"
+                "Incompatible with: in_pssm, query",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-msa_master_idx", "msa_master_idx"],
+                "Index of sequence to use as master in MSA.\n\n"
+                "Index (1-based) of sequence to use as the master in the multiple "
+                "sequence alignment. If not specified, the first sequence is used.",
+                equate=False,
+            ),
+            _Option(
+                ["-in_pssm", "in_pssm"],
+                "PSI-BLAST checkpoint file.\n\n"
+                "Incompatible with: in_msa, query, phi_pattern",
+                filename=True,
+                equate=False,
+            ),
+            # PSSM engine options:
+            _Option(
+                ["-pseudocount", "pseudocount"],
+                "Pseudo-count value used when constructing PSSM.\n\n"
+                "Integer. Default is zero.",
+                equate=False,
+            ),
+            _Option(
+                ["-inclusion_ethresh", "inclusion_ethresh"],
+                "E-value inclusion threshold for pairwise alignments (float, default 0.002).",
+                equate=False,
+            ),
+            _Switch(
+                ["-ignore_msa_master", "ignore_msa_master"],
+                "Ignore the master sequence when creating PSSM.\n\n"
+                "Requires: in_msa.\n"
+                "Incompatible with: msa_master_idx, in_pssm, query, query_loc, "
+                "phi_pattern",
+            ),
+            # PHI-BLAST options:
+            _Option(
+                ["-phi_pattern", "phi_pattern"],
+                "File name containing pattern to search.\n\n"
+                "Incompatible with: in_pssm",
+                filename=True,
+                equate=False,
+            ),
+        ]
+        _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
+
+    def _validate(self):
+        incompatibles = {
+            "num_iterations": ["remote"],
+            "in_msa": ["in_pssm", "query"],
+            "in_pssm": ["in_msa", "query", "phi_pattern"],
+            "ignore_msa_master": [
+                "msa_master_idx",
+                "in_pssm",
+                "query",
+                "query_loc",
+                "phi_pattern",
+            ],
+        }
+        self._validate_incompatibilities(incompatibles)
+        _Ncbiblast2SeqCommandline._validate(self)
+
+
+class NcbirpsblastCommandline(_NcbiblastCommandline):
+    """Wrapper for the NCBI BLAST+ program rpsblast.
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old rpsblast tool with a similar tool of the same name. This
+    wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast.
+
+    >>> from Bio.Blast.Applications import NcbirpsblastCommandline
+    >>> cline = NcbirpsblastCommandline(help=True)
+    >>> cline
+    NcbirpsblastCommandline(cmd='rpsblast', help=True)
+    >>> print(cline)
+    rpsblast -help
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="rpsblast", **kwargs):
+        """Initialize the class."""
+        # TODO - remove the -word_size argument as per BLAST+ 2.2.30
+        # (BLAST team say it should never have been included, since
+        # the word size is set when building the domain database.)
+        # This likely means reviewing the class hierarchy again.
+        self.parameters = [
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+            # Restrict search or results:
+            _Option(
+                ["-culling_limit", "culling_limit"],
+                "Hit culling limit (integer).\n\n"
+                "If the query range of a hit is enveloped by that of at "
+                "least this many higher-scoring hits, delete the hit.\n\n"
+                "Incompatible with: best_hit_overhang, best_hit_score_edge.",
+                equate=False,
+            ),
+            _Option(
+                ["-best_hit_overhang", "best_hit_overhang"],
+                "Best Hit algorithm overhang value (recommended value: 0.1).\n\n"
+                "Float between 0.0 and 0.5 inclusive. "
+                "Incompatible with: culling_limit.",
+                equate=False,
+            ),
+            _Option(
+                ["-best_hit_score_edge", "best_hit_score_edge"],
+                "Best Hit algorithm score edge value (recommended value: 0.1).\n\n"
+                "Float between 0.0 and 0.5 inclusive. "
+                "Incompatible with: culling_limit.",
+                equate=False,
+            ),
+            # General search options:
+            _Option(
+                ["-comp_based_stats", "comp_based_stats"],
+                "Use composition-based statistics.\n\n"
+                "D or d: default (equivalent to 0)\n\n"
+                "0 or F or f: Simplified Composition-based statistics as in "
+                "Bioinformatics 15:1000-1011, 1999\n\n"
+                "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
+                "2001\n\n"
+                "Default = 0.",
+                checker_function=lambda value: value in "Dd0Ff1Tt",
+                equate=False,
+            ),
+            # Misc options:
+            _Switch(
+                ["-use_sw_tback", "use_sw_tback"],
+                "Compute locally optimal Smith-Waterman alignments?",
+            ),
+        ]
+        _NcbiblastCommandline.__init__(self, cmd, **kwargs)
+
+    def _validate(self):
+        incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]}
+        self._validate_incompatibilities(incompatibles)
+        _NcbiblastCommandline._validate(self)
+
+
+class NcbirpstblastnCommandline(_NcbiblastCommandline):
+    """Wrapper for the NCBI BLAST+ program rpstblastn.
+
+    With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+    replaced the old rpsblast tool with a similar tool of the same name, and a
+    separate tool rpstblastn for Translated Reverse Position Specific BLAST.
+
+    >>> from Bio.Blast.Applications import NcbirpstblastnCommandline
+    >>> cline = NcbirpstblastnCommandline(help=True)
+    >>> cline
+    NcbirpstblastnCommandline(cmd='rpstblastn', help=True)
+    >>> print(cline)
+    rpstblastn -help
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="rpstblastn", **kwargs):
+        """Initialize the class."""
+        # TODO - remove the -word_size argument as per BLAST+ 2.2.30
+        # (BLAST team say it should never have been included, since
+        # the word size is set when building the domain database.)
+        # This likely means reviewing the class hierarchy again.
+        self.parameters = [
+            # Input query options:
+            _Option(
+                ["-strand", "strand"],
+                "Query strand(s) to search against database/subject.\n\n"
+                'Values allowed are "both" (default), "minus", "plus".',
+                checker_function=lambda value: value in ["both", "minus", "plus"],
+                equate=False,
+            ),
+            # Input query options:
+            _Option(
+                ["-query_gencode", "query_gencode"],
+                "Genetic code to use to translate query (integer, default 1).",
+                equate=False,
+            ),
+            # Query filtering options:
+            _Option(
+                ["-seg", "seg"],
+                "Filter query sequence with SEG (string).\n\n"
+                'Format: "yes", "window locut hicut", or "no" to disable. '
+                'Default is "12 2.2 2.5"',
+                equate=False,
+            ),
+            # General search options:
+            _Option(
+                ["-comp_based_stats", "comp_based_stats"],
+                "Use composition-based statistics.\n\n"
+                "D or d: default (equivalent to 0)\n\n"
+                "0 or F or f: Simplified Composition-based statistics as in "
+                "Bioinformatics 15:1000-1011, 1999\n\n"
+                "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
+                "2001\n\n"
+                "Default = 0.",
+                checker_function=lambda value: value in "Dd0Ff1Tt",
+                equate=False,
+            ),
+            # Extension options:
+            _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+            # Miscellaneous options:
+            _Switch(
+                ["-use_sw_tback", "use_sw_tback"],
+                "Compute locally optimal Smith-Waterman alignments?",
+            ),
+        ]
+        _NcbiblastCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbiblastformatterCommandline(_NcbibaseblastCommandline):
+    """Wrapper for the NCBI BLAST+ program blast_formatter.
+
+    With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++
+    instead of C), the NCBI added the ASN.1 output format option to all the
+    search tools, and extended the blast_formatter to support this as input.
+
+    The blast_formatter command allows you to convert the ASN.1 output into
+    the other output formats (XML, tabular, plain text, HTML).
+
+    >>> from Bio.Blast.Applications import NcbiblastformatterCommandline
+    >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml")
+    >>> cline
+    NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn')
+    >>> print(cline)
+    blast_formatter -out example.xml -outfmt 5 -archive example.asn
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+
+    Note that this wrapper is for the version of blast_formatter from BLAST
+    2.2.24+ (or later), which is when the NCBI first announced the inclusion
+    of this tool. There was actually an early version in BLAST 2.2.23+ (and
+    possibly in older releases) but this did not have the -archive option
+    (instead -rid is a mandatory argument), and is not supported by this
+    wrapper.
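+
+    To produce the ASN.1 archive in the first place, run one of the search
+    tools with ``-outfmt 11``, for example (illustrative file names)::
+
+        blastp -query example.fasta -db nr -outfmt 11 -out example.asn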
+ """ + + def __init__(self, cmd="blast_formatter", **kwargs): + """Initialize the class.""" + self.parameters = [ + # Input options + _Option( + ["-rid", "rid"], + "BLAST Request ID (RID), not compatible with archive arg.", + equate=False, + ), + _Option( + ["-archive", "archive"], + "Archive file of results, not compatible with rid arg.", + filename=True, + equate=False, + ), + # Restrict search or results + _Option( + ["-max_target_seqs", "max_target_seqs"], + "Maximum number of aligned sequences to keep.", + checker_function=lambda value: value >= 1, + equate=False, + ), + ] + _NcbibaseblastCommandline.__init__(self, cmd, **kwargs) + + def _validate(self): + incompatibles = {"rid": ["archive"]} + self._validate_incompatibilities(incompatibles) + _NcbibaseblastCommandline._validate(self) + + +class NcbideltablastCommandline(_Ncbiblast2SeqCommandline): + """Create a commandline for the NCBI BLAST+ program deltablast (for proteins). + + This is a wrapper for the deltablast command line command included in + the NCBI BLAST+ software (not present in the original BLAST). + + >>> from Bio.Blast.Applications import NcbideltablastCommandline + >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr", + ... evalue=0.001, remote=True) + >>> cline + NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True) + >>> print(cline) + deltablast -query rosemary.pro -db nr -evalue 0.001 -remote + + You would typically run the command line with cline() or via the Python + subprocess module, as described in the Biopython tutorial. + """ + + def __init__(self, cmd="deltablast", **kwargs): + """Initialize the class.""" + self.parameters = [ + # General search options: + _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."), + _Option( + ["-threshold", "threshold"], + "Minimum score for words to be added to the BLAST lookup table (float).", + equate=False, + ), + _Option( + ["-comp_based_stats", "comp_based_stats"], + "Use composition-based statistics (string, default 2, i.e. True).\n\n" + "0, F or f: no composition-based statistics.\n\n" + "2, T or t, D or d : Composition-based score adjustment as in " + "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n" + "Note that tblastn also supports values of 1 and 3.", + checker_function=lambda value: value in "0Ft2TtDd", + equate=False, + ), + # Query filtering options: + _Option( + ["-seg", "seg"], + "Filter query sequence with SEG (string).\n\n" + 'Format: "yes", "window locut hicut", or "no" to disable. ' + 'Default is "12 2.2 2.5"', + equate=False, + ), + # Extension options: + _Option( + ["-gap_trigger", "gap_trigger"], + "Number of bits to trigger gapping. Default = 22.", + equate=False, + ), + # Miscellaneous options: + _Switch( + ["-use_sw_tback", "use_sw_tback"], + "Compute locally optimal Smith-Waterman alignments?", + ), + # PSI-BLAST options + _Option( + ["-num_iterations", "num_iterations"], + "Number of iterations to perform. 
(integer >=1, Default is 1).\n\n"
+            "Incompatible with: remote",
+            equate=False,
+        ),
+        _Option(
+            ["-out_pssm", "out_pssm"],
+            "File name to store checkpoint file.",
+            filename=True,
+            equate=False,
+        ),
+        _Option(
+            ["-out_ascii_pssm", "out_ascii_pssm"],
+            "File name to store ASCII version of PSSM.",
+            filename=True,
+            equate=False,
+        ),
+        _Switch(
+            ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
+            "Save PSSM after the last database search.",
+        ),
+        _Switch(
+            ["-save_each_pssm", "save_each_pssm"],
+            "Save PSSM after each iteration.\n\n"
+            "File name is given in -save_pssm or -save_ascii_pssm options.",
+        ),
+        # PSSM engine options
+        _Option(
+            ["-pseudocount", "pseudocount"],
+            "Pseudo-count value used when constructing PSSM (integer, default 0).",
+            equate=False,
+        ),
+        _Option(
+            ["-domain_inclusion_ethresh", "domain_inclusion_ethresh"],
+            "E-value inclusion threshold for alignments with conserved domains.\n\n"
+            "(float, Default is 0.05)",
+            equate=False,
+        ),
+        _Option(
+            ["-inclusion_ethresh", "inclusion_ethresh"],
+            "Pairwise alignment e-value inclusion threshold (float, default 0.002).",
+            equate=False,
+        ),
+        # DELTA-BLAST options
+        _Option(
+            ["-rpsdb", "rpsdb"],
+            "BLAST domain database name (string, default 'cdd_delta').",
+            equate=False,
+        ),
+        _Switch(
+            ["-show_domain_hits", "show_domain_hits"],
+            "Show domain hits?\n\nIncompatible with: remote, subject",
+        ),
+    ]
+    _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbimakeblastdbCommandline(AbstractCommandline):
+    """Wrapper for the NCBI BLAST+ program makeblastdb.
+
+    This is a wrapper for the NCBI BLAST+ makeblastdb application
+    to create BLAST databases. By default, this creates a blast database
+    with the same name as the input file. The default output location
+    is the same directory as the input.
+
+    >>> from Bio.Blast.Applications import NcbimakeblastdbCommandline
+    >>> cline = NcbimakeblastdbCommandline(dbtype="prot",
+    ...                                    input_file="NC_005816.faa")
+    >>> cline
+    NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa')
+    >>> print(cline)
+    makeblastdb -dbtype prot -in NC_005816.faa
+
+    You would typically run the command line with cline() or via the Python
+    subprocess module, as described in the Biopython tutorial.
+    """
+
+    def __init__(self, cmd="makeblastdb", **kwargs):
+        """Initialize the class."""
+        self.parameters = [
+            # Basic input options
+            _Switch(
+                ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
+            ),
+            _Switch(
+                ["-help", "help"],
+                "Print USAGE, DESCRIPTION and ARGUMENTS description; "
+                "ignore other arguments.",
+            ),
+            _Switch(
+                ["-version", "version"],
+                "Print version number; ignore other arguments.",
+            ),
+            # Output configuration options
+            _Option(
+                ["-out", "out"],
+                "Name of the BLAST database to be created.",
+                filename=True,
+                equate=False,
+            ),
+            # makeblastdb specific options
+            _Option(
+                ["-blastdb_version", "blastdb_version"],
+                "Version of BLAST database to be created. "
+                "Tip: use BLAST database version 4 on 32 bit CPU. "
+                "Default = 5.",
+                equate=False,
+                checker_function=lambda x: x == 4 or x == 5,
+            ),
+            _Option(
+                ["-dbtype", "dbtype"],
+                "Molecule type of target db ('nucl' or 'prot').",
+                equate=False,
+                is_required=True,
+                checker_function=lambda x: x == "nucl" or x == "prot",
+            ),
+            _Option(
+                ["-in", "input_file"],
+                "Input file/database name.",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-input_type", "input_type"],
+                "Type of the data specified in input_file.\n\n"
+                "Default = 'fasta'. Added in BLAST 2.2.26.",
+                filename=False,
+                equate=False,
+                checker_function=self._input_type_checker,
+            ),
+            _Option(
+                ["-title", "title"],
+                "Title for BLAST database.",
+                filename=False,
+                equate=False,
+            ),
+            _Switch(
+                ["-parse_seqids", "parse_seqids"],
+                "Option to parse seqid for FASTA input if set.\n\n"
+                "For all other input types, seqids are parsed automatically.",
+            ),
+            _Switch(
+                ["-hash_index", "hash_index"], "Create index of sequence hash values."
+            ),
+            _Option(
+                ["-mask_data", "mask_data"],
+                "Comma-separated list of input files containing masking "
+                "data as produced by NCBI masking applications "
+                "(e.g. dustmasker, segmasker, windowmasker).",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-mask_id", "mask_id"],
+                "Comma-separated list of strings to uniquely identify the "
+                "masking algorithm.",
+                filename=False,
+                equate=False,
+            ),
+            _Option(
+                ["-mask_desc", "mask_desc"],
+                "Comma-separated list of free form strings to describe "
+                "the masking algorithm details.",
+                filename=False,
+                equate=False,
+            ),
+            _Switch(["-gi_mask", "gi_mask"], "Create GI indexed masking data."),
+            _Option(
+                ["-gi_mask_name", "gi_mask_name"],
+                "Comma-separated list of masking data output files.",
+                filename=False,
+                equate=False,
+            ),
+            _Option(
+                ["-max_file_sz", "max_file_sz"],
+                "Maximum file size for BLAST database files. Default = '1GB'.",
+                filename=False,
+                equate=False,
+            ),
+            _Option(
+                ["-logfile", "logfile"],
+                "File to which the program log should be redirected.",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["-taxid", "taxid"],
+                "Taxonomy ID to assign to all sequences.",
+                filename=False,
+                equate=False,
+                checker_function=lambda x: type(x)(int(x)) == x,
+            ),
+            _Option(
+                ["-taxid_map", "taxid_map"],
+                "Text file mapping sequence IDs to taxonomy IDs.\n\n"
+                "Format:<SequenceId> <TaxonomyId><newline>",
+                filename=True,
+                equate=False,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+    def _input_type_checker(self, command):
+        return command in ("asn1_bin", "asn1_txt", "blastdb", "fasta")
+
+    def _validate(self):
+        incompatibles = {
+            "mask_id": ["gi_mask"],
+            "gi_mask": ["mask_id"],
+            "taxid": ["taxid_map"],
+        }
+
+        # Copied from _NcbibaseblastCommandline class above.
+        # Code repeated here for python2 and 3 compatibility,
+        # because this is not a _NcbibaseblastCommandline subclass.
+        for a in incompatibles:
+            if self._get_parameter(a):
+                for b in incompatibles[a]:
+                    if self._get_parameter(b):
+                        raise ValueError("Options %s and %s are incompatible." % (a, b))
+
+        if self.mask_id and not self.mask_data:
+            raise ValueError("Option mask_id requires mask_data to be set.")
+        if self.mask_desc and not self.mask_id:
+            raise ValueError("Option mask_desc requires mask_id to be set.")
+        if self.gi_mask and not self.parse_seqids:
+            raise ValueError("Option gi_mask requires parse_seqids to be set.")
+        if self.gi_mask_name and not (self.mask_data and self.gi_mask):
+            raise ValueError(
+                "Option gi_mask_name requires mask_data and gi_mask to be set."
+ ) + if self.taxid_map and not self.parse_seqids: + raise ValueError("Option taxid_map requires parse_seqids to be set.") + AbstractCommandline._validate(self) + + +def _test(): + """Run the Bio.Blast.Applications module's doctests (PRIVATE).""" + import doctest + + doctest.testmod(verbose=1) + + +if __name__ == "__main__": + # Run the doctests + _test() diff --git a/code/lib/Bio/Blast/NCBIWWW.py b/code/lib/Bio/Blast/NCBIWWW.py new file mode 100644 index 0000000..4bcca3f --- /dev/null +++ b/code/lib/Bio/Blast/NCBIWWW.py @@ -0,0 +1,348 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Patched by Brad Chapman. +# Chris Wroe added modifications for work in myGrid + +"""Code to invoke the NCBI BLAST server over the internet. + +This module provides code to work with the WWW version of BLAST +provided by the NCBI. https://blast.ncbi.nlm.nih.gov/ +""" + + +import warnings + +from io import StringIO +import time + +from urllib.request import urlopen +from urllib.parse import urlencode +from urllib.request import Request + +from Bio import BiopythonWarning + + +NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" + + +def qblast( + program, + database, + sequence, + url_base=NCBI_BLAST_URL, + auto_format=None, + composition_based_statistics=None, + db_genetic_code=None, + endpoints=None, + entrez_query="(none)", + expect=10.0, + filter=None, + gapcosts=None, + genetic_code=None, + hitlist_size=50, + i_thresh=None, + layout=None, + lcase_mask=None, + matrix_name=None, + nucl_penalty=None, + nucl_reward=None, + other_advanced=None, + perc_ident=None, + phi_pattern=None, + query_file=None, + query_believe_defline=None, + query_from=None, + query_to=None, + searchsp_eff=None, + service=None, + threshold=None, + ungapped_alignment=None, + word_size=None, + short_query=None, + alignments=500, + alignment_view=None, + descriptions=500, + entrez_links_new_window=None, + expect_low=None, + expect_high=None, + format_entrez_query=None, + format_object=None, + format_type="XML", + ncbi_gi=None, + results_file=None, + show_overview=None, + megablast=None, + template_type=None, + template_length=None, +): + """BLAST search using NCBI's QBLAST server or a cloud service provider. + + Supports all parameters of the old qblast API for Put and Get. + + Please note that NCBI uses the new Common URL API for BLAST searches + on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus, + some of the parameters used by this function are not (or are no longer) + officially supported by NCBI. Although they are still functioning, this + may change in the future. + + The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows + doing BLAST searches on cloud servers. To use this feature, please set + ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'`` + and ``format_object='Alignment'``. For more details, please see + https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast + + Some useful parameters: + + - program blastn, blastp, blastx, tblastn, or tblastx (lower case) + - database Which database to search against (e.g. "nr"). + - sequence The sequence to search. + - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. + - descriptions Number of descriptions to show. Def 500. 
+     - alignments     Number of alignments to show.  Def 500.
+     - expect         An expect value cutoff.  Def 10.0.
+     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
+     - filter         "none" turns off filtering.  Default no filtering.
+     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
+     - entrez_query   Entrez query to limit Blast search.
+     - hitlist_size   Number of hits to return.  Default 50.
+     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
+     - short_query    TRUE/FALSE whether to adjust the search parameters for a
+                      short query sequence. Note that this will override
+                      manually set parameters like word size and e value. Turns
+                      off when sequence length is > 30 residues. Default: None.
+     - service        plain, psi, phi, rpsblast, megablast (lower case)
+
+    This function does no checking of the validity of the parameters
+    and passes the values to the server as is. More help is available at:
+    https://ncbi.github.io/blast-cloud/dev/api.html
+
+    """
+    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
+    if program not in programs:
+        raise ValueError(
+            "Program specified is %s. Expected one of %s"
+            % (program, ", ".join(programs))
+        )
+
+    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
+    # assignment from NCBI's side).
+    # Thus we set the (known) parameters directly:
+    if short_query and program == "blastn":
+        short_query = None
+        # We only use the 'short-query' parameters for short sequences:
+        if len(sequence) < 31:
+            expect = 1000
+            word_size = 7
+            nucl_reward = 1
+            filter = None
+            lcase_mask = None
+            warnings.warn(
+                '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.'
+                " We bypass the problem by manually adjusting the search parameters."
+                " Thus, results may slightly differ from web page searches.",
+                BiopythonWarning,
+            )
+
+    # Format the "Put" command, which sends search requests to qblast.
+    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
+    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
+    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
+    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
+    parameters = [
+        ("AUTO_FORMAT", auto_format),
+        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
+        ("DATABASE", database),
+        ("DB_GENETIC_CODE", db_genetic_code),
+        ("ENDPOINTS", endpoints),
+        ("ENTREZ_QUERY", entrez_query),
+        ("EXPECT", expect),
+        ("FILTER", filter),
+        ("GAPCOSTS", gapcosts),
+        ("GENETIC_CODE", genetic_code),
+        ("HITLIST_SIZE", hitlist_size),
+        ("I_THRESH", i_thresh),
+        ("LAYOUT", layout),
+        ("LCASE_MASK", lcase_mask),
+        ("MEGABLAST", megablast),
+        ("MATRIX_NAME", matrix_name),
+        ("NUCL_PENALTY", nucl_penalty),
+        ("NUCL_REWARD", nucl_reward),
+        ("OTHER_ADVANCED", other_advanced),
+        ("PERC_IDENT", perc_ident),
+        ("PHI_PATTERN", phi_pattern),
+        ("PROGRAM", program),
+        # ('PSSM', pssm), - Is it possible to use PSI-BLAST via this API?
+        ("QUERY", sequence),
+        ("QUERY_FILE", query_file),
+        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
+        ("QUERY_FROM", query_from),
+        ("QUERY_TO", query_to),
+        # ('RESULTS_FILE', ...), - Can we use this parameter?
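+        # Note that ('SERVICE', service) below selects the search service
+        # (plain/psi/phi/megablast, see the docstring above); the same value
+        # is sent again with the "Get" command further down.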
+ ("SEARCHSP_EFF", searchsp_eff), + ("SERVICE", service), + ("SHORT_QUERY_ADJUST", short_query), + ("TEMPLATE_TYPE", template_type), + ("TEMPLATE_LENGTH", template_length), + ("THRESHOLD", threshold), + ("UNGAPPED_ALIGNMENT", ungapped_alignment), + ("WORD_SIZE", word_size), + ("CMD", "Put"), + ] + query = [x for x in parameters if x[1] is not None] + message = urlencode(query).encode() + + # Send off the initial query to qblast. + # Note the NCBI do not currently impose a rate limit here, other + # than the request not to make say 50 queries at once using multiple + # threads. + request = Request(url_base, message, {"User-Agent": "BiopythonClient"}) + handle = urlopen(request) + + # Format the "Get" command, which gets the formatted results from qblast + # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 + rid, rtoe = _parse_qblast_ref_page(handle) + parameters = [ + ("ALIGNMENTS", alignments), + ("ALIGNMENT_VIEW", alignment_view), + ("DESCRIPTIONS", descriptions), + ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window), + ("EXPECT_LOW", expect_low), + ("EXPECT_HIGH", expect_high), + ("FORMAT_ENTREZ_QUERY", format_entrez_query), + ("FORMAT_OBJECT", format_object), + ("FORMAT_TYPE", format_type), + ("NCBI_GI", ncbi_gi), + ("RID", rid), + ("RESULTS_FILE", results_file), + ("SERVICE", service), + ("SHOW_OVERVIEW", show_overview), + ("CMD", "Get"), + ] + query = [x for x in parameters if x[1] is not None] + message = urlencode(query).encode() + + # Poll NCBI until the results are ready. + # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo + # 1. Do not contact the server more often than once every 10 seconds. + # 2. Do not poll for any single RID more often than once a minute. + # 3. Use the URL parameter email and tool, so that the NCBI + # can contact you if there is a problem. + # 4. Run scripts weekends or between 9 pm and 5 am Eastern time + # on weekdays if more than 50 searches will be submitted. + # -- + # Could start with a 10s delay, but expect most short queries + # will take longer thus at least 70s with delay. Therefore, + # start with 20s delay, thereafter once a minute. + delay = 20 # seconds + while True: + current = time.time() + wait = qblast._previous + delay - current + if wait > 0: + time.sleep(wait) + qblast._previous = current + wait + else: + qblast._previous = current + # delay by at least 60 seconds only if running the request against the public NCBI API + if delay < 60 and url_base == NCBI_BLAST_URL: + # Wasn't a quick return, must wait at least a minute + delay = 60 + + request = Request(url_base, message, {"User-Agent": "BiopythonClient"}) + handle = urlopen(request) + results = handle.read().decode() + + # Can see an "\n\n" page while results are in progress, + # if so just wait a bit longer... + if results == "\n\n": + continue + # XML results don't have the Status tag when finished + if "Status=" not in results: + break + i = results.index("Status=") + j = results.index("\n", i) + status = results[i + len("Status=") : j].strip() + if status.upper() == "READY": + break + return StringIO(results) + + +qblast._previous = 0 + + +def _parse_qblast_ref_page(handle): + """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). + + The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably + 'Request Time of Execution' and RID would be 'Request Identifier'. 
+    """
+    s = handle.read().decode()
+    i = s.find("RID =")
+    if i == -1:
+        rid = None
+    else:
+        j = s.find("\n", i)
+        rid = s[i + len("RID =") : j].strip()
+
+    i = s.find("RTOE =")
+    if i == -1:
+        rtoe = None
+    else:
+        j = s.find("\n", i)
+        rtoe = s[i + len("RTOE =") : j].strip()
+
+    if not rid and not rtoe:
+        # Can we reliably extract the error message from the HTML page?
+        # e.g. "Message ID#24 Error: Failed to read the Blast query:
+        #      Nucleotide FASTA provided for protein sequence"
+        # or "Message ID#32 Error: Query contains no data: Query
+        #    contains no sequence data"
+        #
+        # This used to occur inside a <div class="error msInf"> entry:
+        i = s.find('<div class="error msInf">')
+        if i != -1:
+            msg = s[i + len('<div class="error msInf">') :].strip()
+            msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip()
+            if msg:
+                raise ValueError("Error message from NCBI: %s" % msg)
+        # In spring 2010 the markup was like this:
+        i = s.find('<p class="error">')
+        if i != -1:
+            msg = s[i + len('<p class="error">') :].strip()
+            msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip()
+            if msg:
+                raise ValueError("Error message from NCBI: %s" % msg)
+        # Generic search based on the way the error messages start:
+        i = s.find("Message ID#")
+        if i != -1:
+            # Break the message at the first HTML tag
+            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
+            raise ValueError("Error message from NCBI: %s" % msg)
+        # We didn't recognise the error layout :(
+        # print(s)
+        raise ValueError(
+            "No RID and no RTOE found in the 'please wait' page, "
+            "there was probably an error in your request but we "
+            "could not extract a helpful error message."
+        )
+    elif not rid:
+        # Can this happen?
+        raise ValueError(
+            "No RID found in the 'please wait' page. (although RTOE = %r)" % rtoe
+        )
+    elif not rtoe:
+        # Can this happen?
+        raise ValueError(
+            "No RTOE found in the 'please wait' page. (although RID = %r)" % rid
+        )
+
+    try:
+        return rid, int(rtoe)
+    except ValueError:
+        raise ValueError(
+            "A non-integer RTOE found in the 'please wait' page, %r" % rtoe
+        ) from None
diff --git a/code/lib/Bio/Blast/NCBIXML.py b/code/lib/Bio/Blast/NCBIXML.py
new file mode 100644
index 0000000..90e91a9
--- /dev/null
+++ b/code/lib/Bio/Blast/NCBIXML.py
@@ -0,0 +1,864 @@
+# Copyright 2000 by Bertrand Frottier. All rights reserved.
+# Revisions 2005-2006 copyright Michiel de Hoon
+# Revisions 2006-2009 copyright Peter Cock
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to work with the BLAST XML output.
+
+The BLAST XML DTD file is on the NCBI FTP site at:
+ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
+"""
+
+from Bio.Blast import Record
+import xml.sax
+from xml.sax.handler import ContentHandler
+
+
+class _XMLparser(ContentHandler):
+    """Generic SAX Parser (PRIVATE).
+
+    Just a very basic SAX parser.
+
+    Redefine the methods startElement, characters and endElement.
+    """
+
+    def __init__(self, debug=0):
+        """Initialize the parser.
+
+        Arguments:
+        - debug - integer, amount of debug information to print
+
+        """
+        self._tag = []
+        self._value = ""
+        self._debug = debug
+        self._debug_ignore_list = []
+        self._method_name_level = 1
+        self._method_map = None
+
+    def startElement(self, name, attr):
+        """Found XML start tag.
+
+        No real need of attr, BLAST DTD doesn't use them
+
+        Arguments:
+        - name -- name of the tag
+        - attr -- tag attributes
+
+        """
+        self._tag.append(name)
+
+        if len(self._tag) == 1:
+            # root node
+            self._on_root_node(name)
+            return
+
+        # Try to call a method (defined in subclasses)
+        method = "start_" + self._node_method_name(name)
+
+        # Note could use try / except AttributeError
+        # BUT I found often triggered by nested errors...
+        if method in self._method_map:
+            self._method_map[method]()
+            if self._debug > 4:
+                print("NCBIXML: Parsed: " + method)
+        elif self._debug > 3:
+            # Doesn't exist (yet) and may want to warn about it
+            if method not in self._debug_ignore_list:
+                print("NCBIXML: Ignored: " + method)
+                self._debug_ignore_list.append(method)
+
+        # We don't care about white space in parent tags like Hsp,
+        # but that white space doesn't belong to child tags like Hsp_midline
+        if self._value.strip():
+            raise ValueError(
+                "What should we do with %s before the %r tag?" % (self._value, name)
+            )
+        self._value = ""
+
+    def characters(self, ch):
+        """Found some text.
+ + Arguments: + - ch -- characters read + + """ + self._value += ch # You don't ever get the whole string + + def endElement(self, name): + """Found XML end tag. + + Arguments: + - name -- tag name + + """ + # DON'T strip any white space, we may need it e.g. the hsp-midline + + # Try to call a method (defined in subclasses) + method = "end_" + self._node_method_name(name) + + # Note could use try / except AttributeError + # BUT I found often triggered by nested errors... + if method in self._method_map: + self._method_map[method]() + if self._debug > 2: + print("NCBIXML: Parsed: %s %s" % (method, self._value)) + elif self._debug > 1: + # Doesn't exist (yet) and may want to warn about it + if method not in self._debug_ignore_list: + print("NCBIXML: Ignored: %s %s" % (method, self._value)) + self._debug_ignore_list.append(method) + + # Reset character buffer + self._value = "" + + self._tag.pop() + + def _node_method_name(self, name): + if self._method_name_level == 1: + return name + return "/".join(self._tag[-self._method_name_level :]) + + +class BlastParser(_XMLparser): + """Parse XML BLAST data into a Record.Blast object. + + Parses XML output from BLAST (direct use discouraged). + This (now) returns a list of Blast records. + Historically it returned a single Blast record. + You are expected to use this via the parse or read functions. + + All XML 'action' methods are private methods and may be: + + - ``_start_TAG`` called when the start tag is found + - ``_end_TAG`` called when the end tag is found + + """ + + def __init__(self, debug=0): + """Initialize the parser. + + Arguments: + - debug - integer, amount of debug information to print + + """ + # Calling superclass method + _XMLparser.__init__(self, debug) + + self._parser = xml.sax.make_parser() + self._parser.setContentHandler(self) + + # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd + self._parser.setFeature(xml.sax.handler.feature_validation, 0) + self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) + self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) + self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) + + self._xml_version = 1 + + self.reset() + + def reset(self): + """Reset all the data allowing reuse of the BlastParser() object.""" + self._records = [] + self._header = Record.Header() + self._parameters = Record.Parameters() + self._parameters.filter = None # Maybe I should update the class? + + def _on_root_node(self, name): + if name == "BlastOutput": + self._setup_blast_v1() + elif name == "BlastXML2": + self._setup_blast_v2() + else: + raise ValueError( + "Invalid root node name: %s. 
Root node should be either" + " BlastOutput or BlastXML2" % name + ) + + def _setup_blast_v1(self): + self._method_map = { + "start_Iteration": self._start_blast_record, + "end_Iteration": self._end_blast_record, + "end_BlastOutput_program": self._set_header_application, + "end_BlastOutput_version": self._set_header_version, + "end_BlastOutput_reference": self._set_header_reference, + "end_BlastOutput_db": self._set_header_database, + "end_BlastOutput_query-ID": self._set_header_query_id, + "end_BlastOutput_query-def": self._set_header_query, + "end_BlastOutput_query-len": self._set_header_query_letters, + "end_Iteration_query-ID": self._set_record_query_id, + "end_Iteration_query-def": self._set_record_query_def, + "end_Iteration_query-len": self._set_record_query_letters, + "end_BlastOutput_hits": self._set_record_hits, + "end_Parameters_matrix": self._set_parameters_matrix, + "end_Parameters_expect": self._set_parameters_expect, + "end_Parameters_sc-match": self._set_parameters_sc_match, + "end_Parameters_sc-mismatch": self._set_parameters_sc_mismatch, + "end_Parameters_gap-open": self._set_parameters_gap_penalties, + "end_Parameters_gap-extend": self._set_parameters_gap_extend, + "end_Parameters_filter": self._set_parameters_filter, + "start_Hit": self._start_hit, + "end_Hit": self._end_hit, + "end_Hit_id": self.set_hit_id, + "end_Hit_def": self.set_hit_def, + "end_Hit_accession": self.set_hit_accession, + "end_Hit_len": self.set_hit_len, + "start_Hsp": self._start_hsp, + "end_Hsp_score": self._set_hsp_score, + "end_Hsp_bit-score": self._set_hsp_bit_score, + "end_Hsp_evalue": self._set_hsp_e_value, + "end_Hsp_query-from": self._set_hsp_query_start, + "end_Hsp_query-to": self._set_hsp_query_end, + "end_Hsp_hit-from": self._set_hsp_hit_from, + "end_Hsp_hit-to": self._set_hsp_hit_to, + "end_Hsp_query-frame": self._set_hsp_query_frame, + "end_Hsp_hit-frame": self._set_hsp_hit_frame, + "end_Hsp_identity": self._set_hsp_identity, + "end_Hsp_positive": self._set_hsp_positive, + "end_Hsp_gaps": self._set_hsp_gaps, + "end_Hsp_align-len": self._set_hsp_align_len, + "end_Hsp_qseq": self._set_hsp_query_seq, + "end_Hsp_hseq": self._set_hsp_subject_seq, + "end_Hsp_midline": self._set_hsp_midline, + "end_Statistics_db-num": self._set_statistics_db_num, + "end_Statistics_db-len": self._set_statistics_db_len, + "end_Statistics_hsp-len": self._set_statistics_hsp_len, + "end_Statistics_eff-space": self._set_statistics_eff_space, + "end_Statistics_kappa": self._set_statistics_kappa, + "end_Statistics_lambda": self._set_statistics_lambda, + "end_Statistics_entropy": self._set_statistics_entropy, + } + + def _setup_blast_v2(self): + self._method_name_level = 2 + self._xml_version = 2 + self._method_map = { + "start_report/Report": self._start_blast_record, + "end_report/Report": self._end_blast_record, + "end_Report/program": self._set_header_application, + "end_Report/version": self._set_header_version, + "end_Report/reference": self._set_header_reference, + "end_Target/db": self._set_header_database, + "end_Search/query-id": self._set_record_query_id, + "end_Search/query-title": self._set_record_query_def, + "end_Search/query-len": self._set_record_query_letters, + "end_BlastOutput_hits": self._set_record_hits, + "end_Parameters/matrix": self._set_parameters_matrix, + "end_Parameters/expect": self._set_parameters_expect, + "end_Parameters/sc-match": self._set_parameters_sc_match, + "end_Parameters/sc-mismatch": self._set_parameters_sc_mismatch, + "end_Parameters/gap-open": 
self._set_parameters_gap_penalties, + "end_Parameters/gap-extend": self._set_parameters_gap_extend, + "end_Parameters/filter": self._set_parameters_filter, + "start_hits/Hit": self._start_hit, + "end_hits/Hit": self._end_hit, + "start_description/HitDescr": self._start_hit_descr_item, + "end_description/HitDescr": self._end_hit_descr_item, + "end_HitDescr/id": self._end_description_id, + "end_HitDescr/accession": self._end_description_accession, + "end_HitDescr/title": self._end_description_title, + "end_HitDescr/taxid": self._end_description_taxid, + "end_HitDescr/sciname": self._end_description_sciname, + "end_Hit/len": self.set_hit_len, + "start_hsps/Hsp": self._start_hsp, + "end_hsps/Hsp": self._end_hsp, + "end_Hsp/score": self._set_hsp_score, + "end_Hsp/bit-score": self._set_hsp_bit_score, + "end_Hsp/evalue": self._set_hsp_e_value, + "end_Hsp/query-from": self._set_hsp_query_start, + "end_Hsp/query-to": self._set_hsp_query_end, + "end_Hsp/hit-from": self._set_hsp_hit_from, + "end_Hsp/hit-to": self._set_hsp_hit_to, + "end_Hsp/query-frame": self._set_hsp_query_frame, + "end_Hsp/hit-frame": self._set_hsp_hit_frame, + "end_Hsp/query-strand": self._set_hsp_query_strand, + "end_Hsp/hit-strand": self._set_hsp_hit_strand, + "end_Hsp/identity": self._set_hsp_identity, + "end_Hsp/positive": self._set_hsp_positive, + "end_Hsp/gaps": self._set_hsp_gaps, + "end_Hsp/align-len": self._set_hsp_align_len, + "end_Hsp/qseq": self._set_hsp_query_seq, + "end_Hsp/hseq": self._set_hsp_subject_seq, + "end_Hsp/midline": self._set_hsp_midline, + "end_Statistics/db-num": self._set_statistics_db_num, + "end_Statistics/db-len": self._set_statistics_db_len, + "end_Statistics/hsp-len": self._set_statistics_hsp_len, + "end_Statistics/eff-space": self._set_statistics_eff_space, + "end_Statistics/kappa": self._set_statistics_kappa, + "end_Statistics/lambda": self._set_statistics_lambda, + "end_Statistics/entropy": self._set_statistics_entropy, + } + + def _start_blast_record(self): + """Start interaction (PRIVATE).""" + self._blast = Record.Blast() + + def _end_blast_record(self): + """End interaction (PRIVATE).""" + # We stored a lot of generic "top level" information + # in self._header (an object of type Record.Header) + self._blast.reference = self._header.reference + self._blast.date = self._header.date + self._blast.version = self._header.version + self._blast.database = self._header.database + self._blast.application = self._header.application + + # These are required for "old" pre 2.2.14 files + # where only , + # and were used. Now they + # are supplemented/replaced by , + # and + if not hasattr(self._blast, "query") or not self._blast.query: + self._blast.query = self._header.query + if not hasattr(self._blast, "query_id") or not self._blast.query_id: + self._blast.query_id = self._header.query_id + if not hasattr(self._blast, "query_letters") or not self._blast.query_letters: + self._blast.query_letters = self._header.query_letters + + # Hack to record the query length as both the query_letters and + # query_length properties (as in the plain text parser, see + # Bug 2176 comment 12): + self._blast.query_length = self._blast.query_letters + # Perhaps in the long term we should deprecate one, but I would + # prefer to drop query_letters - so we need a transition period + # with both. + + # Hack to record the claimed database size as database_length + # (as well as in num_letters_in_database, see Bug 2176 comment 13): + self._blast.database_length = self._blast.num_letters_in_database + # TODO? 
Deprecate database_letters next? + + # Hack to record the claimed database sequence count as database_sequences + self._blast.database_sequences = self._blast.num_sequences_in_database + + # Apply the "top level" parameter information + self._blast.matrix = self._parameters.matrix + self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e + self._blast.gap_penalties = self._parameters.gap_penalties + self._blast.filter = self._parameters.filter + self._blast.expect = self._parameters.expect + self._blast.sc_match = self._parameters.sc_match + self._blast.sc_mismatch = self._parameters.sc_mismatch + + # Add to the list + self._records.append(self._blast) + # Clear the object (a new empty one is create in _start_Iteration) + self._blast = None + + if self._debug: + print("NCBIXML: Added Blast record to results") + + # Header + def _set_header_application(self): + """BLAST program, e.g., blastp, blastn, etc. (PRIVATE). + + Save this to put on each blast record object + """ + self._header.application = self._value.upper() + + def _set_header_version(self): + """Version number and date of the BLAST engine (PRIVATE). + + e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be + variants like "BLASTP 2.2.18+" without the date. + + Save this to put on each blast record object + """ + parts = self._value.split() + # TODO - Check the first word starts with BLAST? + + # The version is the second word (field one) + self._header.version = parts[1] + + # Check there is a third word (the date) + if len(parts) >= 3: + if parts[2][0] == "[" and parts[2][-1] == "]": + self._header.date = parts[2][1:-1] + else: + # Assume this is still a date, but without the + # square brackets + self._header.date = parts[2] + + def _set_header_reference(self): + """Record any article reference describing the algorithm (PRIVATE). + + Save this to put on each blast record object + """ + self._header.reference = self._value + + def _set_header_database(self): + """Record the database(s) searched (PRIVATE). + + Save this to put on each blast record object + """ + self._header.database = self._value + + def _set_header_query_id(self): + """Record the identifier of the query (PRIVATE). + + Important in old pre 2.2.14 BLAST, for recent versions + is enough + """ + self._header.query_id = self._value + + def _set_header_query(self): + """Record the definition line of the query (PRIVATE). + + Important in old pre 2.2.14 BLAST, for recent versions + is enough + """ + self._header.query = self._value + + def _set_header_query_letters(self): + """Record the length of the query (PRIVATE). + + Important in old pre 2.2.14 BLAST, for recent versions + is enough + """ + self._header.query_letters = int(self._value) + + def _set_record_query_id(self): + """Record the identifier of the query (PRIVATE).""" + self._blast.query_id = self._value + + def _set_record_query_def(self): + """Record the definition line of the query (PRIVATE).""" + self._blast.query = self._value + + def _set_record_query_letters(self): + """Record the length of the query (PRIVATE).""" + self._blast.query_letters = int(self._value) + + # def _end_BlastOutput_query_seq(self): + # """The query sequence (PRIVATE).""" + # pass # XXX Missing in Record.Blast ? 
+
+    # def _end_BlastOutput_iter_num(self):
+    #     """The psi-blast iteration number (PRIVATE)."""
+    #     pass # XXX TODO PSI
+
+    def _set_record_hits(self):
+        """Hits to the database sequences, one for every sequence (PRIVATE)."""
+        self._blast.num_hits = int(self._value)
+
+    # def _end_BlastOutput_message(self):
+    #     """error messages (PRIVATE)."""
+    #     pass # XXX What to do ?
+
+    # Parameters
+    def _set_parameters_matrix(self):
+        """Matrix used (-M on legacy BLAST) (PRIVATE)."""
+        self._parameters.matrix = self._value
+
+    def _set_parameters_expect(self):
+        """Expect value cutoff (PRIVATE)."""
+        # NOTE: In old text output there was a line:
+        # Number of sequences better than 1.0e-004: 1
+        # As far as I can see, parameters.num_seqs_better_e
+        # would take the value of 1, and the expectation
+        # value was not recorded.
+        #
+        # Anyway we should NOT record this against num_seqs_better_e
+        self._parameters.expect = self._value
+
+    # def _end_Parameters_include(self):
+    #     """Inclusion threshold for a psi-blast iteration (-h) (PRIVATE)."""
+    #     pass # XXX TODO PSI
+
+    def _set_parameters_sc_match(self):
+        """Match score for nucleotide-nucleotide comparison (-r) (PRIVATE)."""
+        self._parameters.sc_match = int(self._value)
+
+    def _set_parameters_sc_mismatch(self):
+        """Mismatch penalty for nucleotide-nucleotide comparison (-q) (PRIVATE)."""
+        self._parameters.sc_mismatch = int(self._value)
+
+    def _set_parameters_gap_penalties(self):
+        """Gap existence cost (-G) (PRIVATE)."""
+        self._parameters.gap_penalties = int(self._value)
+
+    def _set_parameters_gap_extend(self):
+        """Gap extension cost (-E) (PRIVATE)."""
+        self._parameters.gap_penalties = (
+            self._parameters.gap_penalties,
+            int(self._value),
+        )
+
+    def _set_parameters_filter(self):
+        """Record filtering options (-F) (PRIVATE)."""
+        self._parameters.filter = self._value
+
+    # def _end_Parameters_pattern(self):
+    #     """Pattern used for phi-blast search (PRIVATE).
+    #     """
+    #     pass # XXX TODO PSI
+
+    # def _end_Parameters_entrez_query(self):
+    #     """Entrez query used to limit search (PRIVATE).
+ # """ + # pass # XXX TODO PSI + + # Hits + def _start_hit(self): + """Start filling records (PRIVATE).""" + self._blast.alignments.append(Record.Alignment()) + self._descr = ( + Record.Description() if self._xml_version == 1 else Record.DescriptionExt() + ) + self._blast.descriptions.append(self._descr) + self._blast.multiple_alignment = [] + self._hit = self._blast.alignments[-1] + + self._descr.num_alignments = 0 + + def _end_hit(self): + """Clear variables (PRIVATE).""" + # Cleanup + self._blast.multiple_alignment = None + self._hit = None + self._descr = None + + def set_hit_id(self): + """Record the identifier of the database sequence (PRIVATE).""" + self._hit.hit_id = self._value + self._hit.title = self._value + " " + + def set_hit_def(self): + """Record the definition line of the database sequence (PRIVATE).""" + self._hit.hit_def = self._value + self._hit.title += self._value + self._descr.title = self._hit.title + + def set_hit_accession(self): + """Record the accession value of the database sequence (PRIVATE).""" + self._hit.accession = self._value + self._descr.accession = self._value + + def set_hit_len(self): + """Record the length of the hit.""" + self._hit.length = int(self._value) + + # HSPs + def _start_hsp(self): + # Note that self._start_Hit() should have been called + # to setup things like self._blast.multiple_alignment + self._hsp = Record.HSP() + self._hsp.positives = None + self._hit.hsps.append(self._hsp) + self._descr.num_alignments += 1 + self._blast.multiple_alignment.append(Record.MultipleAlignment()) + self._mult_al = self._blast.multiple_alignment[-1] + + def _end_hsp(self): + if self._hsp.frame and len(self._hsp.frame) == 1: + self._hsp.frame += (0,) + + # Hsp_num is useless + def _set_hsp_score(self): + """Record the raw score of HSP (PRIVATE).""" + self._hsp.score = float(self._value) + if self._descr.score is None: + self._descr.score = float(self._value) + + def _set_hsp_bit_score(self): + """Record the Bit score of HSP (PRIVATE).""" + self._hsp.bits = float(self._value) + if self._descr.bits is None: + self._descr.bits = float(self._value) + + def _set_hsp_e_value(self): + """Record the expect value of the HSP (PRIVATE).""" + self._hsp.expect = float(self._value) + if self._descr.e is None: + self._descr.e = float(self._value) + + def _set_hsp_query_start(self): + """Offset of query at the start of the alignment (one-offset) (PRIVATE).""" + self._hsp.query_start = int(self._value) + + def _set_hsp_query_end(self): + """Offset of query at the end of the alignment (one-offset) (PRIVATE).""" + self._hsp.query_end = int(self._value) + + def _set_hsp_hit_from(self): + """Offset of the database at the start of the alignment (one-offset) (PRIVATE).""" + self._hsp.sbjct_start = int(self._value) + + def _set_hsp_hit_to(self): + """Offset of the database at the end of the alignment (one-offset) (PRIVATE).""" + self._hsp.sbjct_end = int(self._value) + + # def _end_Hsp_pattern_from(self): + # """Start of phi-blast pattern on the query (one-offset) (PRIVATE).""" + # pass # XXX TODO PSI + + # def _end_Hsp_pattern_to(self): + # """End of phi-blast pattern on the query (one-offset) (PRIVATE).""" + # pass # XXX TODO PSI + + def _set_hsp_query_frame(self): + """Frame of the query if applicable (PRIVATE).""" + v = int(self._value) + self._hsp.frame = (v,) + if self._header.application == "BLASTN": + self._hsp.strand = ("Plus" if v > 0 else "Minus",) + + def _set_hsp_hit_frame(self): + """Frame of the database sequence if applicable (PRIVATE).""" + v = int(self._value) 
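+        # The query frame (if any) was stored first by _set_hsp_query_frame;
+        # append the hit frame as the second element of the tuple, padding
+        # with 0 when no query frame was parsed.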
+        if len(self._hsp.frame) == 0:
+            self._hsp.frame = (0, v)
+        else:
+            self._hsp.frame += (v,)
+        if self._header.application == "BLASTN":
+            self._hsp.strand += ("Plus" if v > 0 else "Minus",)
+
+    def _set_hsp_query_strand(self):
+        """Strand of the query if applicable (PRIVATE)."""
+        self._hsp.strand = (self._value,)
+        if self._header.application == "BLASTN":
+            self._hsp.frame = (1 if self._value == "Plus" else -1,)
+
+    def _set_hsp_hit_strand(self):
+        """Strand of the database sequence if applicable (PRIVATE)."""
+        self._hsp.strand += (self._value,)
+        if self._header.application == "BLASTN":
+            self._hsp.frame += (1 if self._value == "Plus" else -1,)
+
+    def _set_hsp_identity(self):
+        """Record the number of identities in the alignment (PRIVATE)."""
+        v = int(self._value)
+        self._hsp.identities = v
+        if self._hsp.positives is None:
+            self._hsp.positives = v
+
+    def _set_hsp_positive(self):
+        """Record the number of positive (conservative) substitutions in the alignment (PRIVATE)."""
+        self._hsp.positives = int(self._value)
+
+    def _set_hsp_gaps(self):
+        """Record the number of gaps in the alignment (PRIVATE)."""
+        self._hsp.gaps = int(self._value)
+
+    def _set_hsp_align_len(self):
+        """Record the length of the alignment (PRIVATE)."""
+        self._hsp.align_length = int(self._value)
+
+    # def _end_Hsp_density(self):
+    #     """Score density (PRIVATE)."""
+    #     pass # XXX ???
+
+    def _set_hsp_query_seq(self):
+        """Record the alignment string for the query (PRIVATE)."""
+        self._hsp.query = self._value
+
+    def _set_hsp_subject_seq(self):
+        """Record the alignment string for the database (PRIVATE)."""
+        self._hsp.sbjct = self._value
+
+    def _set_hsp_midline(self):
+        """Record the middle line as normally seen in BLAST report (PRIVATE)."""
+        self._hsp.match = self._value  # do NOT strip spaces!
+        assert len(self._hsp.match) == len(self._hsp.query)
+        assert len(self._hsp.match) == len(self._hsp.sbjct)
+
+    # Statistics
+    def _set_statistics_db_num(self):
+        """Record the number of sequences in the database (PRIVATE)."""
+        self._blast.num_sequences_in_database = int(self._value)
+
+    def _set_statistics_db_len(self):
+        """Record the number of letters in the database (PRIVATE)."""
+        self._blast.num_letters_in_database = int(self._value)
+
+    def _set_statistics_hsp_len(self):
+        """Record the effective HSP length (PRIVATE)."""
+        self._blast.effective_hsp_length = int(self._value)
+
+    def _set_statistics_eff_space(self):
+        """Record the effective search space (PRIVATE)."""
+        self._blast.effective_search_space = float(self._value)
+
+    def _set_statistics_kappa(self):
+        """Karlin-Altschul parameter K (PRIVATE)."""
+        self._blast.ka_params = float(self._value)
+
+    def _set_statistics_lambda(self):
+        """Karlin-Altschul parameter Lambda (PRIVATE)."""
+        self._blast.ka_params = (float(self._value), self._blast.ka_params)
+
+    def _set_statistics_entropy(self):
+        """Karlin-Altschul parameter H (PRIVATE)."""
+        self._blast.ka_params = self._blast.ka_params + (float(self._value),)
+
+    def _start_hit_descr_item(self):
+        """XML v2. Start hit description item."""
+        self._hit_descr_item = Record.DescriptionExtItem()
+
+    def _end_hit_descr_item(self):
+        """XML v2. End hit description item."""
+        self._descr.append_item(self._hit_descr_item)
+        if not self._hit.title:
+            self._hit.title = str(self._hit_descr_item)
+        self._hit_descr_item = None
+
+    def _end_description_id(self):
+        """XML v2. The identifier of the database sequence (PRIVATE)."""
+        self._hit_descr_item.id = self._value
+        if not self._hit.hit_id:
+            self._hit.hit_id = self._value
+
+    def _end_description_accession(self):
+        """XML v2. The accession value of the database sequence (PRIVATE)."""
+        self._hit_descr_item.accession = self._value
+        if not getattr(self._hit, "accession", None):
+            self._hit.accession = self._value
+
+    def _end_description_title(self):
+        """XML v2. The hit description title (PRIVATE)."""
+        self._hit_descr_item.title = self._value
+
+    def _end_description_taxid(self):
+        try:
+            self._hit_descr_item.taxid = int(self._value)
+        except ValueError:
+            pass
+
+    def _end_description_sciname(self):
+        self._hit_descr_item.sciname = self._value
+
+
+def read(handle, debug=0):
+    """Return a single Blast record (assumes just one query).
+
+    Uses the BlastParser internally.
+
+    This function is for use when there is one and only one BLAST
+    result in your XML file.
+
+    Use the Bio.Blast.NCBIXML.parse() function if you expect more than
+    one BLAST record (i.e. if you have more than one query sequence).
+    """
+    iterator = parse(handle, debug)
+    try:
+        record = next(iterator)
+    except StopIteration:
+        raise ValueError("No records found in handle") from None
+    try:
+        next(iterator)
+        raise ValueError("More than one record found in handle")
+    except StopIteration:
+        pass
+    return record
+
+
+def parse(handle, debug=0):
+    """Return an iterator of Blast records, one for each query.
+
+    Incremental parser, this is an iterator that returns
+    Blast records. It uses the BlastParser internally.
+
+    handle - file handle to an XML file to parse
+    debug - integer, amount of debug information to print
+
+    This is a generator function that returns multiple Blast record
+    objects - one for each query sequence given to blast. The file
+    is read incrementally, returning complete records as they are read
+    in.
+
+    Should cope with new BLAST 2.2.14+ which gives a single XML file
+    for multiple query records.
+
+    Should also cope with XML output from older versions of BLAST which
+    gave multiple XML files concatenated together (giving a single file
+    which strictly speaking wasn't valid XML).
+    """
+    from xml.parsers import expat
+
+    BLOCK = 1024
+    MARGIN = 10  # must be at least length of newline + XML start
+    XML_START = "<?xml"
diff --git a/code/lib/Bio/Blast/Record.py b/code/lib/Bio/Blast/Record.py
new file mode 100644
--- /dev/null
+++ b/code/lib/Bio/Blast/Record.py
+"""Record classes to hold BLAST output."""
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+def fmt_(value, format_spec, default_str="<unknown>"):
+    """Ensure the given value formats to a string correctly."""
+    if value is None:
+        return default_str
+    return format_spec % value
+
+
+class Header:
+    """Saves information from a blast header.
+
+    Members:
+    application         The name of the BLAST flavor that generated this data.
+    version             Version of blast used.
+    date                Date this data was generated.
+    reference           Reference for blast.
+
+    query               Name of query sequence.
+    query_letters       Number of letters in the query sequence. (int)
+
+    database            Name of the database.
+    database_sequences  Number of sequences in the database. (int)
+    database_letters    Number of letters in the database. (int)
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.application = ""
+        self.version = ""
+        self.date = ""
+        self.reference = ""
+
+        self.query = ""
+        self.query_letters = None
+
+        self.database = ""
+        self.database_sequences = None
+        self.database_letters = None
+
+
+class Description:
+    """Stores information about one hit in the descriptions section.
+
+    Members:
+    title           Title of the hit.
+    score           Number of bits. (int)
+    bits            Bit score. (float)
+    e               E value. (float)
+    num_alignments  Number of alignments for the same subject.
(int) + """ + + def __init__(self): + """Initialize the class.""" + self.title = "" + self.score = None + self.bits = None + self.e = None + self.num_alignments = None + + def __str__(self): + """Return the description as a string.""" + return "%-66s %5s %s" % (self.title, self.score, self.e) + + +class DescriptionExt(Description): + """Extended description record for BLASTXML version 2. + + Members: + items List of DescriptionExtItem + """ + + def __init__(self): + """Initialize the class.""" + super().__init__() + + self.items = [] + + def append_item(self, item): + """Add a description extended record.""" + if len(self.items) == 0: + self.title = str(item) + self.items.append(item) + + +class DescriptionExtItem: + """Stores information about one record in hit description for BLASTXML version 2. + + Members: + id Database identifier + title Title of the hit. + """ + + def __init__(self): + """Initialize the class.""" + self.id = None + self.title = None + self.accession = None + self.taxid = None + self.sciname = None + + def __str__(self): + """Return the description identifier and title as a string.""" + return "%s %s" % (self.id, self.title) + + +class Alignment: + """Stores information about one hit in the alignments section. + + Members: + title Name. + hit_id Hit identifier. (str) + hit_def Hit definition. (str) + length Length. (int) + hsps A list of HSP objects. + + """ + + def __init__(self): + """Initialize the class.""" + self.title = "" + self.hit_id = "" + self.hit_def = "" + self.length = None + self.hsps = [] + + def __str__(self): + """Return the BLAST alignment as a formatted string.""" + lines = self.title.split("\n") + lines.append("Length = %s\n" % self.length) + return "\n ".join(lines) + + +class HSP: + """Stores information about one hsp in an alignment hit. + + Members: + - score BLAST score of hit. (float) + - bits Number of bits for that score. (float) + - expect Expect value. (float) + - num_alignments Number of alignments for same subject. (int) + - identities Number of identities (int) if using the XML parser. + Tuple of number of identities/total aligned (int, int) + if using the (obsolete) plain text parser. + - positives Number of positives (int) if using the XML parser. + Tuple of number of positives/total aligned (int, int) + if using the (obsolete) plain text parser. + - gaps Number of gaps (int) if using the XML parser. + Tuple of number of gaps/total aligned (int, int) if + using the (obsolete) plain text parser. + - align_length Length of the alignment. (int) + - strand Tuple of (query, target) strand. + - frame Tuple of 1 or 2 frame shifts, depending on the flavor. + + - query The query sequence. + - query_start The start residue for the query sequence. (1-based) + - query_end The end residue for the query sequence. (1-based) + - match The match sequence. + - sbjct The sbjct sequence. + - sbjct_start The start residue for the sbjct sequence. (1-based) + - sbjct_end The end residue for the sbjct sequence. (1-based) + + Not all flavors of BLAST return values for every attribute:: + + score expect identities positives strand frame + BLASTP X X X X + BLASTN X X X X X + BLASTX X X X X X + TBLASTN X X X X X + TBLASTX X X X X X/X + + Note: for BLASTX, the query sequence is shown as a protein sequence, + but the numbering is based on the nucleotides. Thus, the numbering + is 3x larger than the number of amino acid residues. A similar effect + can be seen for the sbjct sequence in TBLASTN, and for both sequences + in TBLASTX. 
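+
+    For example, an ungapped BLASTX HSP of 30 aligned residues reported
+    as starting at query position 10 spans 90 nucleotides, ending at
+    query position 99.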
+ + Also, for negative frames, the sequence numbering starts from + query_start and counts down. + + """ + + def __init__(self): + """Initialize the class.""" + self.score = None + self.bits = None + self.expect = None + self.num_alignments = None + self.identities = (None, None) + self.positives = (None, None) + self.gaps = (None, None) + self.align_length = None + self.strand = (None, None) + self.frame = () + + self.query = "" + self.query_start = None + self.query_end = None + self.match = "" + self.sbjct = "" + self.sbjct_start = None + self.sbjct_end = None + + def __str__(self): + """Return the BLAST HSP as a formatted string.""" + lines = [ + "Score %s (%s bits), expectation %s, alignment length %s" + % ( + fmt_(self.score, "%i"), + fmt_(self.bits, "%i"), + fmt_(self.expect, "%0.1e"), + fmt_(self.align_length, "%i"), + ) + ] + if self.align_length is None: + return "\n".join(lines) + if self.align_length < 50: + lines.append( + "Query:%8s %s %s" % (self.query_start, self.query, self.query_end) + ) + lines.append(" %s" % self.match) + lines.append( + "Sbjct:%8s %s %s" % (self.sbjct_start, self.sbjct, self.sbjct_end) + ) + else: + lines.append( + "Query:%8s %s...%s %s" + % (self.query_start, self.query[:45], self.query[-3:], self.query_end,) + ) + lines.append(" %s...%s" % (self.match[:45], self.match[-3:])) + lines.append( + "Sbjct:%8s %s...%s %s" + % (self.sbjct_start, self.sbjct[:45], self.sbjct[-3:], self.sbjct_end) + ) + return "\n".join(lines) + + +class MultipleAlignment: + """Holds information about a multiple alignment. + + Members: + alignment A list of tuples (name, start residue, sequence, end residue). + + The start residue is 1-based. It may be blank, if that sequence is + not aligned in the multiple alignment. + + """ + + def __init__(self): + """Initialize the class.""" + self.alignment = [] + + def to_generic(self): + """Retrieve generic alignment object for the given alignment. + + Instead of the tuples, this returns a MultipleSeqAlignment object + from Bio.Align, through which you can manipulate and query + the object. + + Thanks to James Casbon for the code. + """ + seq_parts = [] + seq_names = [] + parse_number = 0 + n = 0 + for name, start, seq, end in self.alignment: + if name == "QUERY": # QUERY is the first in each alignment block + parse_number += 1 + n = 0 + + if parse_number == 1: # create on first_parse, append on all others + seq_parts.append(seq) + seq_names.append(name) + else: + seq_parts[n] += seq + n += 1 + + records = ( + SeqRecord(Seq(seq), name) for (name, seq) in zip(seq_names, seq_parts) + ) + return MultipleSeqAlignment(records) + + +class Round: + """Holds information from a PSI-BLAST round. + + Members: + number Round number. (int) + reused_seqs Sequences in model, found again. List of Description objects. + new_seqs Sequences not found, or below threshold. List of Description. + alignments A list of Alignment objects. + multiple_alignment A MultipleAlignment object. + """ + + def __init__(self): + """Initialize the class.""" + self.number = None + self.reused_seqs = [] + self.new_seqs = [] + self.alignments = [] + self.multiple_alignment = None + + +class DatabaseReport: + """Holds information about a database report. + + Members: + database_name List of database names. (can have multiple dbs) + num_letters_in_database Number of letters in the database. (int) + num_sequences_in_database List of number of sequences in the database. + posted_date List of the dates the databases were posted. + ka_params A tuple of (lambda, k, h) values. 
(floats) + gapped # XXX this isn't set right! + ka_params_gap A tuple of (lambda, k, h) values. (floats) + + """ + + def __init__(self): + """Initialize the class.""" + self.database_name = [] + self.posted_date = [] + self.num_letters_in_database = [] + self.num_sequences_in_database = [] + self.ka_params = (None, None, None) + self.gapped = 0 + self.ka_params_gap = (None, None, None) + + +class Parameters: + """Holds information about the parameters. + + Members: + matrix Name of the matrix. + gap_penalties Tuple of (open, extend) penalties. (floats) + sc_match Match score for nucleotide-nucleotide comparison + sc_mismatch Mismatch penalty for nucleotide-nucleotide comparison + num_hits Number of hits to the database. (int) + num_sequences Number of sequences. (int) + num_good_extends Number of extensions. (int) + num_seqs_better_e Number of sequences better than e-value. (int) + hsps_no_gap Number of HSP's better, without gapping. (int) + hsps_prelim_gapped Number of HSP's gapped in prelim test. (int) + hsps_prelim_gapped_attemped Number of HSP's attempted in prelim. (int) + hsps_gapped Total number of HSP's gapped. (int) + query_length Length of the query. (int) + query_id Identifier of the query sequence. (str) + database_length Number of letters in the database. (int) + effective_hsp_length Effective HSP length. (int) + effective_query_length Effective length of query. (int) + effective_database_length Effective length of database. (int) + effective_search_space Effective search space. (int) + effective_search_space_used Effective search space used. (int) + frameshift Frameshift window. Tuple of (int, float) + threshold Threshold. (int) + window_size Window size. (int) + dropoff_1st_pass Tuple of (score, bits). (int, float) + gap_x_dropoff Tuple of (score, bits). (int, float) + gap_x_dropoff_final Tuple of (score, bits). (int, float) + gap_trigger Tuple of (score, bits). (int, float) + blast_cutoff Tuple of (score, bits). (int, float) + """ + + def __init__(self): + """Initialize the class.""" + self.matrix = "" + self.gap_penalties = (None, None) + self.sc_match = None + self.sc_mismatch = None + self.num_hits = None + self.num_sequences = None + self.num_good_extends = None + self.num_seqs_better_e = None + self.hsps_no_gap = None + self.hsps_prelim_gapped = None + self.hsps_prelim_gapped_attemped = None + self.hsps_gapped = None + self.query_id = None + self.query_length = None + self.database_length = None + self.effective_hsp_length = None + self.effective_query_length = None + self.effective_database_length = None + self.effective_search_space = None + self.effective_search_space_used = None + self.frameshift = (None, None) + self.threshold = None + self.window_size = None + self.dropoff_1st_pass = (None, None) + self.gap_x_dropoff = (None, None) + self.gap_x_dropoff_final = (None, None) + self.gap_trigger = (None, None) + self.blast_cutoff = (None, None) + + +# TODO - Add a friendly __str__ method to BLAST results +class Blast(Header, DatabaseReport, Parameters): + """Saves the results from a blast search. + + Members: + descriptions A list of Description objects. + alignments A list of Alignment objects. + multiple_alignment A MultipleAlignment object. 
+
+    members inherited from base classes
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        Header.__init__(self)
+        DatabaseReport.__init__(self)
+        Parameters.__init__(self)
+        self.descriptions = []
+        self.alignments = []
+        self.multiple_alignment = None
+
+
+class PSIBlast(Header, DatabaseReport, Parameters):
+    """Saves the results from a blastpgp search.
+
+    Members:
+    rounds       A list of Round objects.
+    converged    Whether the search converged.
+
+    members inherited from base classes
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        Header.__init__(self)
+        DatabaseReport.__init__(self)
+        Parameters.__init__(self)
+        self.rounds = []
+        self.converged = 0
diff --git a/code/lib/Bio/Blast/__init__.py b/code/lib/Bio/Blast/__init__.py
new file mode 100644
index 0000000..27c0ec3
--- /dev/null
+++ b/code/lib/Bio/Blast/__init__.py
@@ -0,0 +1,7 @@
+# Copyright 1999 by Jeffrey Chang.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for dealing with BLAST programs and output."""
diff --git a/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc
new file mode 100644
index 0000000..44e9314
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc
new file mode 100644
index 0000000..f6823c0
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc
new file mode 100644
index 0000000..18c8311
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc
new file mode 100644
index 0000000..e31274f
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc
new file mode 100644
index 0000000..37eead1
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..9e31daf
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/CAPS/__init__.py b/code/lib/Bio/CAPS/__init__.py
new file mode 100644
index 0000000..2c28e8f
--- /dev/null
+++ b/code/lib/Bio/CAPS/__init__.py
@@ -0,0 +1,135 @@
+# Copyright 2005 by Jonathan Taylor.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Cleaved amplified polymorphic sequence (CAPS) markers.
+
+A CAPS marker is the location of a DifferentialCutsite, as described below,
+together with a set of primers that can be used to visualize it.  More
+information can be found in the paper `Konieczny and Ausubel (1993)`_
+(PMID 8106085).
+
+..
_`Konieczny and Ausubel (1993)`: https://doi.org/10.1046/j.1365-313X.1993.04020403.x + +""" + + +class DifferentialCutsite: + """Differential enzyme cutsite in an alignment. + + A differential cutsite is a location in an alignment where an enzyme cuts + at least one sequence and also cannot cut at least one other sequence. + + Members: + - start - Where it lives in the alignment. + - enzyme - The enzyme that causes this. + - cuts_in - A list of sequences (as indexes into the alignment) the + enzyme cuts in. + - blocked_in - A list of sequences (as indexes into the alignment) the + enzyme is blocked in. + + """ + + def __init__(self, **kwds): + """Initialize a DifferentialCutsite. + + Each member (as listed in the class description) should be included as a + keyword. + """ + self.start = int(kwds["start"]) + self.enzyme = kwds["enzyme"] + self.cuts_in = kwds["cuts_in"] + self.blocked_in = kwds["blocked_in"] + + +class AlignmentHasDifferentLengthsError(Exception): + """Exception where sequences in alignment have different lengths.""" + + pass + + +class CAPSMap: + """A map of an alignment showing all possible dcuts. + + Members: + - alignment - The alignment that is mapped. + - dcuts - A list of possible CAPS markers in the form of + DifferentialCutsites. + + """ + + def __init__(self, alignment, enzymes=None): + """Initialize the CAPSMap. + + Required: + - alignment - The alignment to be mapped. + + Optional: + - enzymes - List of enzymes to be used to create the map. + Defaults to an empty list. + + """ + if enzymes is None: + enzymes = [] + self.sequences = [rec.seq for rec in alignment] + self.size = len(self.sequences) + self.length = len(self.sequences[0]) + for seq in self.sequences: + if len(seq) != self.length: + raise AlignmentHasDifferentLengthsError + + self.alignment = alignment + self.enzymes = enzymes + + # look for dcuts + self._digest() + + def _digest_with(self, enzyme): + cuts = [] # list of lists, one per sequence + all = [] + + # go through each sequence + for seq in self.sequences: + # grab all the cuts in the sequence + seq_cuts = [cut - enzyme.fst5 for cut in enzyme.search(seq)] + # maintain a list of all cuts in all sequences + all.extend(seq_cuts) + cuts.append(seq_cuts) + + # we sort the all list and remove duplicates + all.sort() + + last = -999 + new = [] + for cut in all: + if cut != last: + new.append(cut) + last = cut + all = new + # all now has indices for all sequences in the alignment + + for cut in all: + # test for dcuts + + cuts_in = [] + blocked_in = [] + + for i in range(0, self.size): + seq = self.sequences[i] + if cut in cuts[i]: + cuts_in.append(i) + else: + blocked_in.append(i) + + if cuts_in != [] and blocked_in != []: + self.dcuts.append( + DifferentialCutsite( + start=cut, enzyme=enzyme, cuts_in=cuts_in, blocked_in=blocked_in + ) + ) + + def _digest(self): + self.dcuts = [] + + for enzyme in self.enzymes: + self._digest_with(enzyme) diff --git a/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..06df99c Binary files /dev/null and b/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Cluster/__init__.py b/code/lib/Bio/Cluster/__init__.py new file mode 100644 index 0000000..32444da --- /dev/null +++ b/code/lib/Bio/Cluster/__init__.py @@ -0,0 +1,1293 @@ +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. 
+# +"""Cluster Analysis. + +The Bio.Cluster provides commonly used clustering algorithms and was +designed with the application to gene expression data in mind. However, +this module can also be used for cluster analysis of other types of data. + +Bio.Cluster and the underlying C Clustering Library is described in +M. de Hoon et al. (2004) https://doi.org/10.1093/bioinformatics/bth078 +""" + +import numbers + +try: + import numpy +except ImportError: + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Please install numpy if you want to use Bio.Cluster. " + "See http://www.numpy.org/" + ) from None + +from . import _cluster + +__all__ = ( + "Node", + "Tree", + "kcluster", + "kmedoids", + "treecluster", + "somcluster", + "clusterdistance", + "clustercentroids", + "distancematrix", + "pca", + "Record", + "read", +) + + +__version__ = _cluster.version() + + +class Node(_cluster.Node): + """Element of a hierarchical clustering tree. + + A node contains items or other Nodes(sub-nodes). + """ + + __doc__ = _cluster.Node.__doc__ + + +class Tree(_cluster.Tree): + """Hierarchical clustering tree. + + A Tree consists of Nodes. + """ + + def sort(self, order=None): + """Sort the hierarchical clustering tree. + + Sort the hierarchical clustering tree by switching the left and + right subnode of nodes such that the elements in the left-to-right + order of the tree tend to have increasing order values. + + Return the indices of the elements in the left-to-right order in + the hierarchical clustering tree, such that the element with index + indices[i] occurs at position i in the dendrogram. + + """ + n = len(self) + 1 + indices = numpy.ones(n, dtype="intc") + if order is None: + order = numpy.ones(n, dtype="d") + elif isinstance(order, numpy.ndarray): + order = numpy.require(order, dtype="d", requirements="C") + else: + order = numpy.array(order, dtype="d") + _cluster.Tree.sort(self, indices, order) + return indices + + def cut(self, nclusters=None): + """Create clusters by cutting the hierarchical clustering tree. + + Divide the elements in a hierarchical clustering result mytree + into clusters, and return an array with the number of the cluster + to which each element was assigned. + + Keyword arguments: + - nclusters: The desired number of clusters. + """ + n = len(self) + 1 + indices = numpy.ones(n, dtype="intc") + if nclusters is None: + nclusters = n + _cluster.Tree.cut(self, indices, nclusters) + return indices + + +def kcluster( + data, + nclusters=2, + mask=None, + weight=None, + transpose=False, + npass=1, + method="a", + dist="e", + initialid=None, +): + """Perform k-means clustering. + + This function performs k-means clustering on the values in data, and + returns the cluster assignments, the within-cluster sum of distances + of the optimal k-means clustering solution, and the number of times + the optimal solution was found. + + Keyword arguments: + - data: nrows x ncolumns array containing the data values. + - nclusters: number of clusters (the 'k' in k-means). + - mask: nrows x ncolumns array of integers, showing which data + are missing. If mask[i,j]==0, then data[i,j] is missing. + - weight: the weights to be used when calculating distances + - transpose: + - if False: rows are clustered; + - if True: columns are clustered. + - npass: number of times the k-means clustering algorithm is + performed, each time with a different (random) initial + condition. 
+ - method: specifies how the center of a cluster is found: + - method == 'a': arithmetic mean; + - method == 'm': median. + - dist: specifies the distance function to be used: + - dist == 'e': Euclidean distance; + - dist == 'b': City Block distance; + - dist == 'c': Pearson correlation; + - dist == 'a': absolute value of the correlation; + - dist == 'u': uncentered correlation; + - dist == 'x': absolute uncentered correlation; + - dist == 's': Spearman's rank correlation; + - dist == 'k': Kendall's tau. + - initialid: the initial clustering from which the algorithm + should start. + If initialid is None, the routine carries out npass + repetitions of the EM algorithm, each time starting from a + different random initial clustering. If initialid is given, + the routine carries out the EM algorithm only once, starting + from the given initial clustering and without randomizing the + order in which items are assigned to clusters (i.e., using + the same order as in the data matrix). In that case, the + k-means algorithm is fully deterministic. + + Return values: + - clusterid: array containing the number of the cluster to which each + item was assigned in the best k-means clustering solution that was + found in the npass runs; + - error: the within-cluster sum of distances for the returned k-means + clustering solution; + - nfound: the number of times this solution was found. + """ + data = __check_data(data) + shape = data.shape + if transpose: + ndata, nitems = shape + else: + nitems, ndata = shape + mask = __check_mask(mask, shape) + weight = __check_weight(weight, ndata) + clusterid, npass = __check_initialid(initialid, npass, nitems) + error, nfound = _cluster.kcluster( + data, nclusters, mask, weight, transpose, npass, method, dist, clusterid + ) + return clusterid, error, nfound + + +def kmedoids(distance, nclusters=2, npass=1, initialid=None): + """Perform k-medoids clustering. + + This function performs k-medoids clustering, and returns the cluster + assignments, the within-cluster sum of distances of the optimal + k-medoids clustering solution, and the number of times the optimal + solution was found. + + Keyword arguments: + - distance: The distance matrix between the items. There are three + ways in which you can pass a distance matrix: + 1. a 2D Numerical Python array (in which only the left-lower + part of the array will be accessed); + 2. a 1D Numerical Python array containing the distances + consecutively; + 3. a list of rows containing the lower-triangular part of + the distance matrix. + + Examples are: + + >>> from numpy import array + >>> # option 1: + >>> distance = array([[0.0, 1.1, 2.3], + ... [1.1, 0.0, 4.5], + ... [2.3, 4.5, 0.0]]) + >>> # option 2: + >>> distance = array([1.1, 2.3, 4.5]) + >>> # option 3: + >>> distance = [array([]), + ... array([1.1]), + ... array([2.3, 4.5])] + + + These three correspond to the same distance matrix. + - nclusters: number of clusters (the 'k' in k-medoids) + - npass: the number of times the k-medoids clustering algorithm + is performed, each time with a different (random) initial + condition. + - initialid: the initial clustering from which the algorithm should start. + If initialid is not given, the routine carries out npass + repetitions of the EM algorithm, each time starting from a + different random initial clustering. 
 If initialid is given,
+       the routine carries out the EM algorithm only once, starting
+       from the initial clustering specified by initialid and
+       without randomizing the order in which items are assigned to
+       clusters (i.e., using the same order as in the data matrix).
+       In that case, the k-medoids algorithm is fully deterministic.
+
+    Return values:
+     - clusterid: array containing the number of the cluster to which each
+       item was assigned in the best k-medoids clustering solution that was
+       found in the npass runs;
+     - error: the within-cluster sum of distances for the returned k-medoids
+       clustering solution;
+     - nfound: the number of times this solution was found.
+    """
+    distance = __check_distancematrix(distance)
+    nitems = len(distance)
+    clusterid, npass = __check_initialid(initialid, npass, nitems)
+    error, nfound = _cluster.kmedoids(distance, nclusters, npass, clusterid)
+    return clusterid, error, nfound
+
+
+def treecluster(
+    data,
+    mask=None,
+    weight=None,
+    transpose=False,
+    method="m",
+    dist="e",
+    distancematrix=None,
+):
+    """Perform hierarchical clustering, and return a Tree object.
+
+    This function implements the pairwise single, complete, centroid, and
+    average linkage hierarchical clustering methods.
+
+    Keyword arguments:
+     - data: nrows x ncolumns array containing the data values.
+     - mask: nrows x ncolumns array of integers, showing which data are
+       missing. If mask[i][j]==0, then data[i][j] is missing.
+     - weight: the weights to be used when calculating distances.
+     - transpose:
+       - if False, rows are clustered;
+       - if True, columns are clustered.
+     - dist: specifies the distance function to be used:
+       - dist == 'e': Euclidean distance
+       - dist == 'b': City Block distance
+       - dist == 'c': Pearson correlation
+       - dist == 'a': absolute value of the correlation
+       - dist == 'u': uncentered correlation
+       - dist == 'x': absolute uncentered correlation
+       - dist == 's': Spearman's rank correlation
+       - dist == 'k': Kendall's tau
+     - method: specifies which linkage method is used:
+       - method == 's': Single pairwise linkage
+       - method == 'm': Complete (maximum) pairwise linkage (default)
+       - method == 'c': Centroid linkage
+       - method == 'a': Average pairwise linkage
+     - distancematrix: The distance matrix between the items. There are
+       three ways in which you can pass a distance matrix:
+       1. a 2D Numerical Python array (in which only the left-lower
+          part of the array will be accessed);
+       2. a 1D Numerical Python array containing the distances
+          consecutively;
+       3. a list of rows containing the lower-triangular part of
+          the distance matrix.
+
+       Examples are:
+
+       >>> from numpy import array
+       >>> # option 1:
+       >>> distance = array([[0.0, 1.1, 2.3],
+       ...                   [1.1, 0.0, 4.5],
+       ...                   [2.3, 4.5, 0.0]])
+       >>> # option 2:
+       >>> distance = array([1.1, 2.3, 4.5])
+       >>> # option 3:
+       >>> distance = [array([]),
+       ...             array([1.1]),
+       ...             array([2.3, 4.5])]
+
+       These three correspond to the same distance matrix.
+
+       PLEASE NOTE:
+       As the treecluster routine may shuffle the values in the
+       distance matrix as part of the clustering algorithm, be sure
+       to save this array in a different variable before calling
+       treecluster if you need it later.
+
+    Either data or distancematrix should be None. If distancematrix is None,
+    the hierarchical clustering solution is calculated from the values stored
+    in the argument data. If data is None, the hierarchical clustering solution
+    is instead calculated from the distance matrix.
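+
+    For example, the following minimal sketch (illustrative only, reusing the
+    small distance matrix from the options above) clusters three items
+    directly from a distance matrix:
+
+    >>> from numpy import array
+    >>> from Bio.Cluster import treecluster
+    >>> distance = array([[0.0, 1.1, 2.3],
+    ...                   [1.1, 0.0, 4.5],
+    ...                   [2.3, 4.5, 0.0]])
+    >>> tree = treecluster(data=None, distancematrix=distance)
+    >>> len(tree)  # n items are joined by n - 1 nodes
+    2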
+
+    Pairwise centroid-linkage clustering can be performed only from the data
+    values and not from the distance matrix. Pairwise single-, maximum-, and
+    average-linkage clustering can be calculated from the data values or from
+    the distance matrix.
+
+    Return value:
+    treecluster returns a Tree object describing the hierarchical clustering
+    result. See the description of the Tree class for more information.
+    """
+    if data is None and distancematrix is None:
+        raise ValueError("use either data or distancematrix")
+    if data is not None and distancematrix is not None:
+        raise ValueError("use either data or distancematrix; do not use both")
+    if data is not None:
+        data = __check_data(data)
+        shape = data.shape
+        ndata = shape[0] if transpose else shape[1]
+        mask = __check_mask(mask, shape)
+        weight = __check_weight(weight, ndata)
+    if distancematrix is not None:
+        distancematrix = __check_distancematrix(distancematrix)
+        if mask is not None:
+            raise ValueError("mask is ignored if distancematrix is used")
+        if weight is not None:
+            raise ValueError("weight is ignored if distancematrix is used")
+    tree = Tree()
+    _cluster.treecluster(
+        tree, data, mask, weight, transpose, method, dist, distancematrix
+    )
+    return tree
+
+
+def somcluster(
+    data,
+    mask=None,
+    weight=None,
+    transpose=False,
+    nxgrid=2,
+    nygrid=1,
+    inittau=0.02,
+    niter=1,
+    dist="e",
+):
+    """Calculate a Self-Organizing Map.
+
+    This function implements a Self-Organizing Map on a rectangular grid.
+
+    Keyword arguments:
+     - data: nrows x ncolumns array containing the data values;
+     - mask: nrows x ncolumns array of integers, showing which data are
+       missing. If mask[i][j]==0, then data[i][j] is missing.
+     - weight: the weights to be used when calculating distances
+     - transpose:
+       - if False: rows are clustered;
+       - if True: columns are clustered.
+     - nxgrid: the horizontal dimension of the rectangular SOM map
+     - nygrid: the vertical dimension of the rectangular SOM map
+     - inittau: the initial value of tau (the neighborhood function)
+     - niter: the number of iterations
+     - dist: specifies the distance function to be used:
+       - dist == 'e': Euclidean distance
+       - dist == 'b': City Block distance
+       - dist == 'c': Pearson correlation
+       - dist == 'a': absolute value of the correlation
+       - dist == 'u': uncentered correlation
+       - dist == 'x': absolute uncentered correlation
+       - dist == 's': Spearman's rank correlation
+       - dist == 'k': Kendall's tau
+
+    Return values:
+
+     - clusterid: array with two columns, with the number of rows equal to
+       the number of items that are being clustered. Each row in the array
+       contains the x and y coordinates of the cell in the rectangular SOM
+       grid to which the item was assigned.
+     - celldata: an array with dimensions [nxgrid, nygrid, number of columns]
+       if rows are being clustered, or [nxgrid, nygrid, number of rows] if
+       columns are being clustered.
+       Each element [ix, iy] of this array is a 1D vector containing the
+       data values for the centroid of the cluster in the SOM grid cell
+       with coordinates [ix, iy].
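+
+    A minimal usage sketch (illustrative only; the map itself depends on a
+    random initialization, so only the output shapes are deterministic):
+
+    >>> from numpy import array
+    >>> from Bio.Cluster import somcluster
+    >>> data = array([[0.0, 1.0], [1.2, 0.9], [8.0, 7.5], [7.7, 8.1]])
+    >>> clusterid, celldata = somcluster(data, nxgrid=2, nygrid=1, niter=5)
+    >>> clusterid.shape  # one (x, y) grid cell per item
+    (4, 2)
+    >>> celldata.shape  # (nxgrid, nygrid, ncolumns)
+    (2, 1, 2)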
+ """ + if transpose: + ndata, nitems = data.shape + else: + nitems, ndata = data.shape + data = __check_data(data) + shape = data.shape + mask = __check_mask(mask, shape) + weight = __check_weight(weight, ndata) + if nxgrid < 1: + raise ValueError("nxgrid should be a positive integer (default is 2)") + if nygrid < 1: + raise ValueError("nygrid should be a positive integer (default is 1)") + clusterids = numpy.ones((nitems, 2), dtype="intc") + celldata = numpy.empty((nxgrid, nygrid, ndata), dtype="d") + _cluster.somcluster( + clusterids, celldata, data, mask, weight, transpose, inittau, niter, dist + ) + return clusterids, celldata + + +def clusterdistance( + data, + mask=None, + weight=None, + index1=None, + index2=None, + method="a", + dist="e", + transpose=False, +): + """Calculate and return the distance between two clusters. + + Keyword arguments: + - data: nrows x ncolumns array containing the data values. + - mask: nrows x ncolumns array of integers, showing which data are + missing. If mask[i, j]==0, then data[i, j] is missing. + - weight: the weights to be used when calculating distances + - index1: 1D array identifying which items belong to the + first cluster. If the cluster contains only one item, then + index1 can also be written as a single integer. + - index2: 1D array identifying which items belong to the + second cluster. If the cluster contains only one item, then + index2 can also be written as a single integer. + - dist: specifies the distance function to be used: + - dist == 'e': Euclidean distance + - dist == 'b': City Block distance + - dist == 'c': Pearson correlation + - dist == 'a': absolute value of the correlation + - dist == 'u': uncentered correlation + - dist == 'x': absolute uncentered correlation + - dist == 's': Spearman's rank correlation + - dist == 'k': Kendall's tau + - method: specifies how the distance between two clusters is defined: + - method == 'a': the distance between the arithmetic means + of the two clusters + - method == 'm': the distance between the medians of the two clusters + - method == 's': the smallest pairwise distance between members + of the two clusters + - method == 'x': the largest pairwise distance between members + of the two clusters + - method == 'v': average of the pairwise distances between members + of the two clusters + - transpose: + - if False: clusters of rows are considered; + - if True: clusters of columns are considered. + """ + data = __check_data(data) + shape = data.shape + ndata = shape[0] if transpose else shape[1] + mask = __check_mask(mask, shape) + weight = __check_weight(weight, ndata) + index1 = __check_index(index1) + index2 = __check_index(index2) + return _cluster.clusterdistance( + data, mask, weight, index1, index2, method, dist, transpose + ) + + +def clustercentroids(data, mask=None, clusterid=None, method="a", transpose=False): + """Calculate and return the centroid of each cluster. + + The clustercentroids routine calculates the cluster centroids, given to + which cluster each item belongs. The centroid is defined as either + the mean or the median over all items for each dimension. + + Keyword arguments: + - data: nrows x ncolumns array containing the data values. + - mask: nrows x ncolumns array of integers, showing which data are + missing. If mask[i, j]==0, then data[i, j] is missing. + - clusterid: array containing the cluster number for each item. + The cluster number should be non-negative. 
+     - method: specifies whether the centroid is calculated from the
+       arithmetic mean (method == 'a', default) or the median (method == 'm')
+       over each dimension.
+     - transpose: if False, each row contains the data for one item;
+       if True, each column contains the data for one item.
+
+    Return values:
+     - cdata: 2D array containing the cluster centroids.
+       If transpose is False, then the dimensions of cdata are
+       nclusters x ncolumns.
+       If transpose is True, then the dimensions of cdata are
+       nrows x nclusters.
+     - cmask: 2D array of integers describing which items in cdata,
+       if any, are missing.
+    """
+    data = __check_data(data)
+    mask = __check_mask(mask, data.shape)
+    nrows, ncolumns = data.shape
+    if clusterid is None:
+        n = ncolumns if transpose else nrows
+        clusterid = numpy.zeros(n, dtype="intc")
+        nclusters = 1
+    else:
+        clusterid = numpy.require(clusterid, dtype="intc", requirements="C")
+        nclusters = max(clusterid + 1)
+    if transpose:
+        shape = (nrows, nclusters)
+    else:
+        shape = (nclusters, ncolumns)
+    cdata = numpy.zeros(shape, dtype="d")
+    cmask = numpy.zeros(shape, dtype="intc")
+    _cluster.clustercentroids(data, mask, clusterid, method, transpose, cdata, cmask)
+    return cdata, cmask
+
+
+def distancematrix(data, mask=None, weight=None, transpose=False, dist="e"):
+    """Calculate and return a distance matrix from the data.
+
+    This function returns the distance matrix calculated from the data.
+
+    Keyword arguments:
+     - data: nrows x ncolumns array containing the data values.
+     - mask: nrows x ncolumns array of integers, showing which data are
+       missing. If mask[i, j]==0, then data[i, j] is missing.
+     - weight: the weights to be used when calculating distances.
+     - transpose: if False: the distances between rows are calculated;
+       if True: the distances between columns are calculated.
+     - dist: specifies the distance function to be used:
+       - dist == 'e': Euclidean distance
+       - dist == 'b': City Block distance
+       - dist == 'c': Pearson correlation
+       - dist == 'a': absolute value of the correlation
+       - dist == 'u': uncentered correlation
+       - dist == 'x': absolute uncentered correlation
+       - dist == 's': Spearman's rank correlation
+       - dist == 'k': Kendall's tau
+
+    Return value:
+    The distance matrix is returned as a list of 1D arrays containing the
+    distance matrix calculated from the data. The number of columns in each
+    row is equal to the row number. Hence, the first row has zero length.
+    For example:
+
+    >>> from numpy import array
+    >>> from Bio.Cluster import distancematrix
+    >>> data = array([[0, 1, 2, 3],
+    ...               [4, 5, 6, 7],
+    ...               [8, 9, 10, 11],
+    ...               [1, 2, 3, 4]])
+    >>> distances = distancematrix(data, dist='e')
+    >>> distances
+    [array([], dtype=float64), array([ 16.]), array([ 64., 16.]), array([ 1., 9., 49.])]
+
+    which can be rewritten as
+       distances = [array([], dtype=float64),
+                    array([ 16.]),
+                    array([ 64., 16.]),
+                    array([ 1., 9., 49.])]
+
+    This corresponds to the distance matrix:
+
+        [ 0., 16., 64.,  1.]
+        [16.,  0., 16.,  9.]
+        [64., 16.,  0., 49.]
+        [ 1.,  9., 49.,  0.]
+    """
+    data = __check_data(data)
+    shape = data.shape
+    mask = __check_mask(mask, shape)
+    if transpose:
+        ndata, nitems = shape
+    else:
+        nitems, ndata = shape
+    weight = __check_weight(weight, ndata)
+    matrix = [numpy.empty(i, dtype="d") for i in range(nitems)]
+    _cluster.distancematrix(data, mask, weight, transpose, dist, matrix)
+    return matrix
+
+
+def pca(data):
+    """Perform principal component analysis.
+
+    Keyword arguments:
+     - data: nrows x ncolumns array containing the data values.
+
+    Return value:
+    This function returns an array containing the mean of each column, the
+    principal components as an nmin x ncolumns array, as well as the
+    coordinates (an nrows x nmin array) of the data along the principal
+    components, and the associated eigenvalues. The principal components, the
+    coordinates, and the eigenvalues are sorted by the magnitude of the
+    eigenvalue, with the largest eigenvalues appearing first. Here, nmin is
+    the smaller of nrows and ncolumns.
+    Adding the column means to the dot product of the coordinates and the
+    principal components recreates the data matrix:
+
+    >>> from numpy import array, dot, amax, amin
+    >>> from Bio.Cluster import pca
+    >>> matrix = array([[ 0.,  0.,  0.],
+    ...                 [ 1.,  0.,  0.],
+    ...                 [ 7.,  3.,  0.],
+    ...                 [ 4.,  2.,  6.]])
+    >>> columnmean, coordinates, pc, _ = pca(matrix)
+    >>> m = matrix - (columnmean + dot(coordinates, pc))
+    >>> amax(m) < 1e-12 and amin(m) > -1e-12
+    True
+
+    """
+    data = __check_data(data)
+    nrows, ncols = data.shape
+    nmin = min(nrows, ncols)
+    columnmean = numpy.empty(ncols, dtype="d")
+    pc = numpy.empty((nmin, ncols), dtype="d")
+    coordinates = numpy.empty((nrows, nmin), dtype="d")
+    eigenvalues = numpy.empty(nmin, dtype="d")
+    _cluster.pca(data, columnmean, coordinates, pc, eigenvalues)
+    return columnmean, coordinates, pc, eigenvalues
+
+
+class Record:
+    """Store gene expression data.
+
+    A Record stores the gene expression data and related information contained
+    in a data file following the file format defined for Michael Eisen's
+    Cluster/TreeView program.
+
+    Attributes:
+     - data: a matrix containing the gene expression data
+     - mask: a matrix containing only 1's and 0's, denoting which values
+       are present (1) or missing (0). If all items of mask are
+       one (no missing data), then mask is set to None.
+     - geneid: a list containing a unique identifier for each gene
+       (e.g., ORF name)
+     - genename: a list containing an additional description for each gene
+       (e.g., gene name)
+     - gweight: the weight to be used for each gene when calculating the
+       distance
+     - gorder: an array of real numbers indicating the preferred order of the
+       genes in the output file
+     - expid: a list containing a unique identifier for each sample.
+     - eweight: the weight to be used for each sample when calculating the
+       distance
+     - eorder: an array of real numbers indicating the preferred order of the
+       samples in the output file
+     - uniqid: the string that was used instead of UNIQID in the input file.
+
+    """
+
+    def __init__(self, handle=None):
+        """Read gene expression data from the file handle and return a Record.
+
+        The file should be in the format defined for Michael Eisen's
+        Cluster/TreeView program.
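+
+        A minimal sketch of the expected layout (hypothetical data, read
+        from an in-memory handle for illustration):
+
+        >>> from io import StringIO
+        >>> handle = StringIO(
+        ...     "YORF\tNAME\tGWEIGHT\tEXP1\tEXP2\n"
+        ...     "GENE1\tgene one\t1.0\t0.5\t1.5\n"
+        ...     "GENE2\tgene two\t1.0\t2.5\t3.5\n"
+        ... )
+        >>> record = Record(handle)
+        >>> record.geneid
+        ['GENE1', 'GENE2']
+        >>> record.data.shape
+        (2, 2)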
+ """ + self.data = None + self.mask = None + self.geneid = None + self.genename = None + self.gweight = None + self.gorder = None + self.expid = None + self.eweight = None + self.eorder = None + self.uniqid = None + if not handle: + return + line = handle.readline().strip("\r\n").split("\t") + n = len(line) + self.uniqid = line[0] + self.expid = [] + cols = {0: "GENEID"} + for word in line[1:]: + if word == "NAME": + cols[line.index(word)] = word + self.genename = [] + elif word == "GWEIGHT": + cols[line.index(word)] = word + self.gweight = [] + elif word == "GORDER": + cols[line.index(word)] = word + self.gorder = [] + else: + self.expid.append(word) + self.geneid = [] + self.data = [] + self.mask = [] + needmask = 0 + for line in handle: + line = line.strip("\r\n").split("\t") + if len(line) != n: + raise ValueError( + "Line with %d columns found (expected %d)" % (len(line), n) + ) + if line[0] == "EWEIGHT": + i = max(cols) + 1 + self.eweight = numpy.array(line[i:], float) + continue + if line[0] == "EORDER": + i = max(cols) + 1 + self.eorder = numpy.array(line[i:], float) + continue + rowdata = [] + rowmask = [] + n = len(line) + for i in range(n): + word = line[i] + if i in cols: + if cols[i] == "GENEID": + self.geneid.append(word) + if cols[i] == "NAME": + self.genename.append(word) + if cols[i] == "GWEIGHT": + self.gweight.append(float(word)) + if cols[i] == "GORDER": + self.gorder.append(float(word)) + continue + if not word: + rowdata.append(0.0) + rowmask.append(0) + needmask = 1 + else: + rowdata.append(float(word)) + rowmask.append(1) + self.data.append(rowdata) + self.mask.append(rowmask) + self.data = numpy.array(self.data) + if needmask: + self.mask = numpy.array(self.mask, int) + else: + self.mask = None + if self.gweight: + self.gweight = numpy.array(self.gweight) + if self.gorder: + self.gorder = numpy.array(self.gorder) + + def treecluster(self, transpose=False, method="m", dist="e"): + """Apply hierarchical clustering and return a Tree object. + + The pairwise single, complete, centroid, and average linkage + hierarchical clustering methods are available. + + Keyword arguments: + - transpose: if False: rows are clustered; + if True: columns are clustered. + - dist: specifies the distance function to be used: + - dist == 'e': Euclidean distance + - dist == 'b': City Block distance + - dist == 'c': Pearson correlation + - dist == 'a': absolute value of the correlation + - dist == 'u': uncentered correlation + - dist == 'x': absolute uncentered correlation + - dist == 's': Spearman's rank correlation + - dist == 'k': Kendall's tau + - method: specifies which linkage method is used: + - method == 's': Single pairwise linkage + - method == 'm': Complete (maximum) pairwise linkage (default) + - method == 'c': Centroid linkage + - method == 'a': Average pairwise linkage + + See the description of the Tree class for more information about + the Tree object returned by this method. + """ + if transpose: + weight = self.gweight + else: + weight = self.eweight + return treecluster(self.data, self.mask, weight, transpose, method, dist) + + def kcluster( + self, + nclusters=2, + transpose=False, + npass=1, + method="a", + dist="e", + initialid=None, + ): + """Apply k-means or k-median clustering. + + This method returns a tuple (clusterid, error, nfound). + + Keyword arguments: + - nclusters: number of clusters (the 'k' in k-means) + - transpose: if False, genes (rows) are clustered; + if True, samples (columns) are clustered. 
+         - npass: number of times the k-means clustering algorithm is
+           performed, each time with a different (random) initial condition.
+         - method: specifies how the center of a cluster is found:
+           - method == 'a': arithmetic mean
+           - method == 'm': median
+         - dist: specifies the distance function to be used:
+           - dist == 'e': Euclidean distance
+           - dist == 'b': City Block distance
+           - dist == 'c': Pearson correlation
+           - dist == 'a': absolute value of the correlation
+           - dist == 'u': uncentered correlation
+           - dist == 'x': absolute uncentered correlation
+           - dist == 's': Spearman's rank correlation
+           - dist == 'k': Kendall's tau
+         - initialid: the initial clustering from which the algorithm should
+           start. If initialid is None, the routine carries out npass
+           repetitions of the EM algorithm, each time starting from a different
+           random initial clustering. If initialid is given, the routine
+           carries out the EM algorithm only once, starting from the given
+           initial clustering and without randomizing the order in which items
+           are assigned to clusters (i.e., using the same order as in the data
+           matrix). In that case, the k-means algorithm is fully deterministic.
+
+        Return values:
+         - clusterid: array containing the number of the cluster to which each
+           gene/sample was assigned in the best k-means clustering
+           solution that was found in the npass runs;
+         - error: the within-cluster sum of distances for the returned
+           k-means clustering solution;
+         - nfound: the number of times this solution was found.
+        """
+        if transpose:
+            weight = self.gweight
+        else:
+            weight = self.eweight
+        return kcluster(
+            self.data,
+            nclusters,
+            self.mask,
+            weight,
+            transpose,
+            npass,
+            method,
+            dist,
+            initialid,
+        )
+
+    def somcluster(
+        self, transpose=False, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist="e"
+    ):
+        """Calculate a self-organizing map on a rectangular grid.
+
+        The somcluster method returns a tuple (clusterid, celldata).
+
+        Keyword arguments:
+         - transpose: if False, genes (rows) are clustered;
+           if True, samples (columns) are clustered.
+         - nxgrid: the horizontal dimension of the rectangular SOM map
+         - nygrid: the vertical dimension of the rectangular SOM map
+         - inittau: the initial value of tau (the neighborhood function)
+         - niter: the number of iterations
+         - dist: specifies the distance function to be used:
+           - dist == 'e': Euclidean distance
+           - dist == 'b': City Block distance
+           - dist == 'c': Pearson correlation
+           - dist == 'a': absolute value of the correlation
+           - dist == 'u': uncentered correlation
+           - dist == 'x': absolute uncentered correlation
+           - dist == 's': Spearman's rank correlation
+           - dist == 'k': Kendall's tau
+
+        Return values:
+         - clusterid: array with two columns, where the number of rows is
+           equal to the number of genes or the number of samples, depending
+           on whether genes or samples are being clustered. Each row in
+           the array contains the x and y coordinates of the cell in the
+           rectangular SOM grid to which the gene or sample was assigned.
+         - celldata: an array with dimensions (nxgrid, nygrid, number of
+           samples) if genes are being clustered, or (nxgrid, nygrid,
+           number of genes) if samples are being clustered. Each item
+           [ix, iy] of this array is a 1D vector containing the gene
+           expression data for the centroid of the cluster in the SOM grid
+           cell with coordinates [ix, iy].
+        """
+        if transpose:
+            weight = self.gweight
+        else:
+            weight = self.eweight
+        return somcluster(
+            self.data,
+            self.mask,
+            weight,
+            transpose,
+            nxgrid,
+            nygrid,
+            inittau,
+            niter,
+            dist,
+        )
+
+    def clustercentroids(self, clusterid=None, method="a", transpose=False):
+        """Calculate the cluster centroids and return a tuple (cdata, cmask).
+
+        The centroid is defined as either the mean or the median over all
+        items for each dimension. The expression data and missing-data mask
+        are taken from this Record.
+
+        Keyword arguments:
+         - transpose: if False, gene (row) clusters are considered;
+           if True, sample (column) clusters are considered.
+         - clusterid: array containing the cluster number for each gene or
+           sample. The cluster number should be non-negative.
+         - method: specifies how the centroid is calculated:
+           - method == 'a': arithmetic mean over each dimension. (default)
+           - method == 'm': median over each dimension.
+
+        Return values:
+         - cdata: 2D array containing the cluster centroids. If transpose
+           is False, then the dimensions of cdata are nclusters x ncolumns.
+           If transpose is True, then the dimensions of cdata are nrows x
+           nclusters.
+         - cmask: 2D array of integers describing which items in cdata,
+           if any, are missing.
+        """
+        return clustercentroids(self.data, self.mask, clusterid, method, transpose)
+
+    def clusterdistance(
+        self, index1=0, index2=0, method="a", dist="e", transpose=False
+    ):
+        """Calculate the distance between two clusters.
+
+        Keyword arguments:
+         - index1: 1D array identifying which genes/samples belong to the
+           first cluster. If the cluster contains only one gene, then
+           index1 can also be written as a single integer.
+         - index2: 1D array identifying which genes/samples belong to the
+           second cluster. If the cluster contains only one gene, then
+           index2 can also be written as a single integer.
+         - dist: specifies the distance function to be used:
+           - dist == 'e': Euclidean distance
+           - dist == 'b': City Block distance
+           - dist == 'c': Pearson correlation
+           - dist == 'a': absolute value of the correlation
+           - dist == 'u': uncentered correlation
+           - dist == 'x': absolute uncentered correlation
+           - dist == 's': Spearman's rank correlation
+           - dist == 'k': Kendall's tau
+         - method: specifies how the distance between two clusters is defined:
+           - method == 'a': the distance between the arithmetic means
+             of the two clusters
+           - method == 'm': the distance between the medians of the
+             two clusters
+           - method == 's': the smallest pairwise distance between members
+             of the two clusters
+           - method == 'x': the largest pairwise distance between members
+             of the two clusters
+           - method == 'v': average of the pairwise distances between members
+             of the two clusters
+         - transpose: if False, clusters of genes (rows) are considered;
+           if True, clusters of samples (columns) are considered.
+        """
+        if transpose:
+            weight = self.gweight
+        else:
+            weight = self.eweight
+        return clusterdistance(
+            self.data, self.mask, weight, index1, index2, method, dist, transpose
+        )
+
+    def distancematrix(self, transpose=False, dist="e"):
+        """Calculate the distance matrix and return it as a list of arrays.
+
+        Keyword arguments:
+         - transpose:
+           if False: calculate the distances between genes (rows);
+           if True: calculate the distances between samples (columns).
+ - dist: specifies the distance function to be used: + - dist == 'e': Euclidean distance + - dist == 'b': City Block distance + - dist == 'c': Pearson correlation + - dist == 'a': absolute value of the correlation + - dist == 'u': uncentered correlation + - dist == 'x': absolute uncentered correlation + - dist == 's': Spearman's rank correlation + - dist == 'k': Kendall's tau + + Return value: + + The distance matrix is returned as a list of 1D arrays containing the + distance matrix between the gene expression data. The number of columns + in each row is equal to the row number. Hence, the first row has zero + length. An example of the return value is: + + matrix = [[], + array([1.]), + array([7., 3.]), + array([4., 2., 6.])] + + This corresponds to the distance matrix: + + [0., 1., 7., 4.] + [1., 0., 3., 2.] + [7., 3., 0., 6.] + [4., 2., 6., 0.] + + """ + if transpose: + weight = self.gweight + else: + weight = self.eweight + return distancematrix(self.data, self.mask, weight, transpose, dist) + + def save(self, jobname, geneclusters=None, expclusters=None): + """Save the clustering results. + + The saved files follow the convention for the Java TreeView program, + which can therefore be used to view the clustering result. + + Keyword arguments: + - jobname: The base name of the files to be saved. The filenames + are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical + clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag + for k-means clustering results. + - geneclusters: For hierarchical clustering results, geneclusters + is a Tree object as returned by the treecluster method. For k-means + clustering results, geneclusters is a vector containing ngenes + integers, describing to which cluster a given gene belongs. This + vector can be calculated by kcluster. + - expclusters: For hierarchical clustering results, expclusters + is a Tree object as returned by the treecluster method. For k-means + clustering results, expclusters is a vector containing nexps + integers, describing to which cluster a given sample belongs. This + vector can be calculated by kcluster. + """ + (ngenes, nexps) = numpy.shape(self.data) + if self.gorder is None: + gorder = numpy.arange(ngenes) + else: + gorder = self.gorder + if self.eorder is None: + eorder = numpy.arange(nexps) + else: + eorder = self.eorder + if ( + geneclusters is not None + and expclusters is not None + and type(geneclusters) != type(expclusters) + ): + raise ValueError( + "found one k-means and one hierarchical " + "clustering solution in geneclusters and " + "expclusters" + ) + gid = 0 + aid = 0 + filename = jobname + postfix = "" + if isinstance(geneclusters, Tree): + # This is a hierarchical clustering result. + geneindex = self._savetree(jobname, geneclusters, gorder, False) + gid = 1 + elif geneclusters is not None: + # This is a k-means clustering result. + filename = jobname + "_K" + k = max(geneclusters) + 1 + kggfilename = "%s_K_G%d.kgg" % (jobname, k) + geneindex = self._savekmeans(kggfilename, geneclusters, gorder, False) + postfix = "_G%d" % k + else: + geneindex = numpy.argsort(gorder) + if isinstance(expclusters, Tree): + # This is a hierarchical clustering result. + expindex = self._savetree(jobname, expclusters, eorder, True) + aid = 1 + elif expclusters is not None: + # This is a k-means clustering result. 
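+            # Mirror the gene handling above: derive k from the sample
+            # cluster ids and write the per-sample assignments to a .kag file.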
+ filename = jobname + "_K" + k = max(expclusters) + 1 + kagfilename = "%s_K_A%d.kag" % (jobname, k) + expindex = self._savekmeans(kagfilename, expclusters, eorder, True) + postfix += "_A%d" % k + else: + expindex = numpy.argsort(eorder) + filename = filename + postfix + self._savedata(filename, gid, aid, geneindex, expindex) + + def _savetree(self, jobname, tree, order, transpose): + """Save the hierarchical clustering solution (PRIVATE).""" + if transpose: + extension = ".atr" + keyword = "ARRY" + else: + extension = ".gtr" + keyword = "GENE" + index = tree.sort(order) + nnodes = len(tree) + with open(jobname + extension, "w") as outputfile: + nodeID = [""] * nnodes + nodedist = numpy.array([node.distance for node in tree[:]]) + for nodeindex in range(nnodes): + min1 = tree[nodeindex].left + min2 = tree[nodeindex].right + nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) + outputfile.write(nodeID[nodeindex]) + outputfile.write("\t") + if min1 < 0: + index1 = -min1 - 1 + outputfile.write(nodeID[index1] + "\t") + nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) + else: + outputfile.write("%s%dX\t" % (keyword, min1)) + if min2 < 0: + index2 = -min2 - 1 + outputfile.write(nodeID[index2] + "\t") + nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) + else: + outputfile.write("%s%dX\t" % (keyword, min2)) + outputfile.write(str(1.0 - nodedist[nodeindex])) + outputfile.write("\n") + return index + + def _savekmeans(self, filename, clusterids, order, transpose): + """Save the k-means clustering solution (PRIVATE).""" + if transpose: + label = "ARRAY" + names = self.expid + else: + label = self.uniqid + names = self.geneid + with open(filename, "w") as outputfile: + outputfile.write(label + "\tGROUP\n") + index = numpy.argsort(order) + n = len(names) + sortedindex = numpy.zeros(n, int) + counter = 0 + cluster = 0 + while counter < n: + for j in index: + if clusterids[j] == cluster: + outputfile.write("%s\t%s\n" % (names[j], cluster)) + sortedindex[counter] = j + counter += 1 + cluster += 1 + return sortedindex + + def _savedata(self, jobname, gid, aid, geneindex, expindex): + """Save the clustered data (PRIVATE).""" + if self.genename is None: + genename = self.geneid + else: + genename = self.genename + (ngenes, nexps) = numpy.shape(self.data) + with open(jobname + ".cdt", "w") as outputfile: + if self.mask is not None: + mask = self.mask + else: + mask = numpy.ones((ngenes, nexps), int) + if self.gweight is not None: + gweight = self.gweight + else: + gweight = numpy.ones(ngenes) + if self.eweight is not None: + eweight = self.eweight + else: + eweight = numpy.ones(nexps) + if gid: + outputfile.write("GID\t") + outputfile.write(self.uniqid) + outputfile.write("\tNAME\tGWEIGHT") + # Now add headers for data columns. 
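+            # The columns are written in expindex order, so the .cdt output
+            # follows the clustered ordering of the samples, not the input order.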
+ for j in expindex: + outputfile.write("\t%s" % self.expid[j]) + outputfile.write("\n") + if aid: + outputfile.write("AID") + if gid: + outputfile.write("\t") + outputfile.write("\t\t") + for j in expindex: + outputfile.write("\tARRY%dX" % j) + outputfile.write("\n") + outputfile.write("EWEIGHT") + if gid: + outputfile.write("\t") + outputfile.write("\t\t") + for j in expindex: + outputfile.write("\t%f" % eweight[j]) + outputfile.write("\n") + for i in geneindex: + if gid: + outputfile.write("GENE%dX\t" % i) + outputfile.write( + "%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]) + ) + for j in expindex: + outputfile.write("\t") + if mask[i, j]: + outputfile.write(str(self.data[i, j])) + outputfile.write("\n") + + +def read(handle): + """Read gene expression data from the file handle and return a Record. + + The file should be in the file format defined for Michael Eisen's + Cluster/TreeView program. + """ + return Record(handle) + + +# Everything below is private +# + + +def __check_data(data): + if isinstance(data, numpy.ndarray): + data = numpy.require(data, dtype="d", requirements="C") + else: + data = numpy.array(data, dtype="d") + if data.ndim != 2: + raise ValueError("data should be 2-dimensional") + if numpy.isnan(data).any(): + raise ValueError("data contains NaN values") + return data + + +def __check_mask(mask, shape): + if mask is None: + return numpy.ones(shape, dtype="intc") + elif isinstance(mask, numpy.ndarray): + return numpy.require(mask, dtype="intc", requirements="C") + else: + return numpy.array(mask, dtype="intc") + + +def __check_weight(weight, ndata): + if weight is None: + return numpy.ones(ndata, dtype="d") + if isinstance(weight, numpy.ndarray): + weight = numpy.require(weight, dtype="d", requirements="C") + else: + weight = numpy.array(weight, dtype="d") + if numpy.isnan(weight).any(): + raise ValueError("weight contains NaN values") + return weight + + +def __check_initialid(initialid, npass, nitems): + if initialid is None: + if npass <= 0: + raise ValueError("npass should be a positive integer") + clusterid = numpy.empty(nitems, dtype="intc") + else: + npass = 0 + clusterid = numpy.array(initialid, dtype="intc") + return clusterid, npass + + +def __check_index(index): + if index is None: + return numpy.zeros(1, dtype="intc") + elif isinstance(index, numbers.Integral): + return numpy.array([index], dtype="intc") + elif isinstance(index, numpy.ndarray): + return numpy.require(index, dtype="intc", requirements="C") + else: + return numpy.array(index, dtype="intc") + + +def __check_distancematrix(distancematrix): + if distancematrix is None: + return distancematrix + if isinstance(distancematrix, numpy.ndarray): + distancematrix = numpy.require(distancematrix, dtype="d", requirements="C") + else: + try: + distancematrix = numpy.array(distancematrix, dtype="d") + except ValueError: + n = len(distancematrix) + d = [None] * n + for i, row in enumerate(distancematrix): + if isinstance(row, numpy.ndarray): + row = numpy.require(row, dtype="d", requirements="C") + else: + row = numpy.array(row, dtype="d") + if row.ndim != 1: + raise ValueError("row %d is not one-dimensional" % i) from None + m = len(row) + if m != i: + raise ValueError( + "row %d has incorrect size (%d, expected %d)" % (i, m, i) + ) from None + if numpy.isnan(row).any(): + raise ValueError("distancematrix contains NaN values") from None + d[i] = row + return d + if numpy.isnan(distancematrix).any(): + raise ValueError("distancematrix contains NaN values") + return distancematrix diff --git 
a/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..0a45e9e
Binary files /dev/null and b/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd b/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd
new file mode 100644
index 0000000..e000cce
Binary files /dev/null and b/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/Cluster/cluster.c b/code/lib/Bio/Cluster/cluster.c
new file mode 100644
index 0000000..89db792
--- /dev/null
+++ b/code/lib/Bio/Cluster/cluster.c
@@ -0,0 +1,5061 @@
+/* The C clustering library.
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
+ *
+ * This library was written at the Laboratory of DNA Information Analysis,
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
+ * Contact: michiel.dehoon 'AT' riken.jp
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation with or without modifications and for any purpose and
+ * without fee is hereby granted, provided that any copyright notices
+ * appear in all copies and that both those copyright notices and this
+ * permission notice appear in supporting documentation, and that the
+ * names of the contributors or copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software
+ * without specific prior permission.
+ *
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include "cluster.h"
+
+/* ************************************************************************ */
+/* SORTING FUNCTIONS */
+/*
+* C qsort() is very slow, much slower than C++ std::sort().
+* This is because qsort() doesn't utilize data-type information at compile time,
+* and it has redundant pointer dereference since it requires a compare function.
+* For projects that use old C, it's impossible to convert to C++/newer C.
+*
+* So we implement a simple quicksort that is ~25% faster than std::sort()
+* with mostly random data, and much faster with structured/sorted data
+*/
+
+static const int INF = INT_MAX; // 2^31 - 1
+
+static int TEMP_SWAP_INT;
+#define swap_int(x,y) {TEMP_SWAP_INT = (x); (x) = (y); (y) = TEMP_SWAP_INT;}
+
+/* For quicksort, we need to choose a random pivot. Any random function should work. Even bad ones.
*/ +static int +cheap_random() +{ + const int base = 2 * 100 * 1000 * 1000 + 33; + static int seed = 0; + seed = seed * 7 + 13; + if (seed > base) seed %= base; + return seed; +} + +static inline int +median_index_of3_index(const double arr[], int index[], const int a, const int b, const int c) +{ + if (arr[index[a]] < arr[index[b]]) { + if (arr[index[b]] < arr[index[c]]) return b; + else if (arr[index[a]] < arr[index[c]]) return c; + else return a; + } + else { + if (arr[index[a]] < arr[index[c]]) return a; + else if (arr[index[b]] < arr[index[c]]) return c; + else return b; + } +} + + +/* Insertion sort is best when the array is small. */ +static void +insertion_sort_index(const double a[], int index[], int l, int r) +{ + int i, j, current_index; + double value; + + if (r <= l) return; + i = l; j = r; + value = a[index[(l + r) >> 1]]; + while (i <= j) { + while (a[index[i]] < value) i++; + while (a[index[j]] > value) j--; + + if (i <= j) { + swap_int(index[i], index[j]); + i++; + j--; + } + } + + for (i = l + 1; i <= r; i++) { + j = i - 1; + value = a[index[i]]; + current_index = index[i]; + + while (j >= l && a[index[j]] > value) { + index[j + 1] = index[j]; + j--; + } + index[j + 1] = current_index; + } +} + +//*************** +static void +fastsort_partition_index(const double a[], int index[], const int left, const int right, int* first_end_ptr, int* second_start_ptr) { + int low, high, i, pivot, mid; + double value; + int increasing = 1, decreasing = 1; + + /*******/ + /* choose a random way to choose pivot, to prevent all possible worst-cases*/ + if ((right - left) & 1) pivot = left + cheap_random() % (right - left); + else pivot = median_index_of3_index(a, index, left, (left + right) >> 1, right); + value = a[index[pivot]]; + + /*******/ + /* Skip through smaller values on left and larger values on right*/ + low = left; high = right; + while (a[index[low]] < value) { + low++; + decreasing = 0; + if (a[index[low]] < a[index[low - 1]]) increasing = 0; + } + + while (a[index[high]] > value) { + high--; + decreasing = 0; + if (a[index[high]] > a[index[high + 1]]) increasing = 0; + } + + increasing &= a[index[high]] >= a[index[low]]; + decreasing &= a[index[high]] <= a[index[low]]; + + /*******/ + /* Resolve degenerate input cases */ + if (increasing) { + if ((right - left) & 1) { + for (i = low + 1; i <= high; i++) if (a[index[i]] < a[index[i - 1]]) { + increasing = 0; + break; + } + } + else { + for (i = high; i >= low + 1; i--) if (a[index[i]] < a[index[i - 1]]) { + increasing = 0; + break; + } + } + if (increasing) { /* sorted */ + *first_end_ptr = INF; + return; + } + } + + if (decreasing) { + if ((right - left) & 1) { + for (i = low + 1; i <= high; i++) if (a[index[i]] > a[index[i - 1]]) { + decreasing = 0; + break; + } + } + else { + for (i = high; i >= low + 1; i--) if (a[index[i]] > a[index[i - 1]]) { + decreasing = 0; + break; + } + } + if (decreasing) { + mid = (right - left + 1) >> 1; + for (i = 0; i < mid; i++) swap_int(index[left + i], index[right - i]); + *first_end_ptr = INF; + return; + } + } + + /******/ + while (low <= high) { + while (a[index[low]] < value) low++; + while (a[index[high]] > value) high--; + + if (low <= high) { + swap_int(index[low], index[high]); + low++; + high--; + } + } + + *first_end_ptr = high; + *second_start_ptr = low; +} + +//*************** +static void +fastsort_recursive_index(const double a[], int index[], int l, int r) +{ + int first_end, second_start; + while (l < r) { + if (r - l <= 70) { /* determined through experiments and 
benchmarks, not randomly; anything in the 70-150 range works fine on
random/mixed (hard) data */
+            insertion_sort_index(a, index, l, r);
+            return;
+        }
+
+        fastsort_partition_index(a, index, l, r, &first_end, &second_start);
+        if (first_end == INF) return; /* sorted */
+
+        /* Recurse into the smaller branch to avoid stack overflow */
+        if (first_end - l < r - second_start) {
+            fastsort_recursive_index(a, index, l, first_end);
+            l = second_start;
+        }
+        else {
+            fastsort_recursive_index(a, index, second_start, r);
+            r = first_end;
+        }
+    }
+}
+
+/* ************************************************************************ */
+
+double
+mean(int n, double a[])
+/*
+   Add 4 elements at once instead of 1. The advantages are:
+   1. less loop overhead;
+   2. compiled with -O2, the loop can be vectorized with SSE/AVX;
+   3. even without AVX it is still faster, because the 4 independent
+      additions can be executed in parallel;
+   4. smaller floating-point error.
+*/
+{
+    double result = 0.;
+    int i;
+    double sum[4] = {0., 0., 0., 0.};
+
+    int nstep4 = n - n % 4;
+    for (i = 0; i < nstep4; i += 4) {
+        sum[0] += a[i];
+        sum[1] += a[i + 1];
+        sum[2] += a[i + 2];
+        sum[3] += a[i + 3];
+    }
+
+    for (i = nstep4; i < n; i++) result += a[i];
+    result += (sum[0] + sum[1]) + (sum[2] + sum[3]);
+
+    return result / n;
+}
+
+/* ************************************************************************ */
+
+double
+median(int n, double x[])
+/*
+Find the median of X(1), ... , X(N), using as much of the quicksort
+algorithm as is needed to isolate it.
+N.B. On exit, the array X is partially ordered.
+Based on Alan J. Miller's median.f90 routine.
+*/
+
+{
+    int i, j;
+    int nr = n / 2;
+    int nl = nr - 1;
+    int even = 0;
+    /* hi & lo are position limits encompassing the median. */
+    int lo = 0;
+    int hi = n-1;
+
+    if (n == 2*nr) even = 1;
+    if (n < 3) {
+        if (n < 1) return 0.;
+        if (n == 1) return x[0];
+        return 0.5*(x[0]+x[1]);
+    }
+
+    /* Find median of 1st, middle & last values. */
+    do {
+        int loop;
+        int mid = (lo + hi)/2;
+        double result = x[mid];
+        double xlo = x[lo];
+        double xhi = x[hi];
+        if (xhi < xlo) { /* swap xlo and xhi */
+            double temp = xlo;
+            xlo = xhi;
+            xhi = temp;
+        }
+        if (result > xhi) result = xhi;
+        else if (result < xlo) result = xlo;
+        /* The basic quicksort algorithm to move all values <= the sort key
+         * to the left-hand end, and all higher values to the other end.
+         */
+        i = lo;
+        j = hi;
+        do {
+            while (x[i] < result) i++;
+            while (x[j] > result) j--;
+            loop = 0;
+            if (i < j) {
+                double temp = x[i];
+                x[i] = x[j];
+                x[j] = temp;
+                i++;
+                j--;
+                if (i <= j) loop = 1;
+            }
+        } while (loop);
+        /* Decide which half the median is in. */
+        if (even) {
+            if (j == nl && i == nr) {
+                /* Special case: n is even, j = n/2 and i = j + 1, so the
+                 * median is between the two halves of the series. Find the
+                 * maximum of the first half and the minimum of the second
+                 * half, then average them.
+                 */
+                int k;
+                double xmax = x[0];
+                double xmin = x[n-1];
+                for (k = lo; k <= j; k++) if (x[k] > xmax) xmax = x[k];
+                for (k = i; k <= hi; k++) if (x[k] < xmin) xmin = x[k];
+                return 0.5*(xmin + xmax);
+            }
+            if (j < nl) lo = i;
+            if (i > nr) hi = j;
+            if (i == j) {
+                if (i == nl) lo = nl;
+                if (j == nr) hi = nr;
+            }
+        }
+        else {
+            if (j < nr) lo = i;
+            if (i > nr) hi = j;
+            /* Test whether median has been isolated. */
+            if (i == j && i == nr) return result;
+        }
+    }
+    while (lo < hi - 1);
+
+    if (even) return (0.5*(x[nl]+x[nr]));
+    if (x[lo] > x[hi]) {
+        double temp = x[lo];
+        x[lo] = x[hi];
+        x[hi] = temp;
+    }
+    return x[nr];
+}
+
+/* ********************************************************************** */
+
+void
+sort_index(int n, const double data[], int index[])
+/* Sets up an index table given the data, such that data[index[]] is in
+ * increasing order. Sorting is done on the indices; the array data
+ * is unchanged.
+ */
+{
+    int i;
+    for (i = 0; i < n; i++) index[i] = i;
+    fastsort_recursive_index(data, index, 0, n - 1);
+}
+
+/* ********************************************************************** */
+
+static double*
+getrank(int n, const double data[], const double weight[])
+/* Calculates the ranks of the elements in the array data. Two elements with
+ * the same value get the same rank, equal to the average of the ranks they
+ * would have received had their values differed. The ranks are returned as
+ * a newly allocated array that should be freed by the calling routine. If
+ * getrank fails due to a memory allocation error, it returns NULL.
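+ * For example, with unit weights the data {2.0, 1.0, 2.0} receive the ranks
+ * {2.5, 1.0, 2.5}: the two tied elements share the average of ranks 2 and 3.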
+ */
+{
+    int i, j, k, l;
+    double* rank;
+    int* index;
+    double total = 0.0;
+    double subtotal;
+    double current;
+    double value;
+
+    rank = malloc(n*sizeof(double));
+    if (!rank) return NULL;
+    index = malloc(n*sizeof(int));
+    if (!index) {
+        free(rank);
+        return NULL;
+    }
+    /* Call sort to get an index table */
+    sort_index(n, data, index);
+    /* Build a rank table */
+    k = 0;
+    j = index[0];
+    current = data[j];
+    subtotal = weight[j];
+    for (i = 1; i < n; i++) {
+        j = index[i];
+        value = data[j];
+        if (value != current) {
+            current = value;
+            value = total + (subtotal + 1.0) / 2.0;
+            for (l = k; l < i; l++) rank[index[l]] = value;
+            k = i;
+            total += subtotal;
+            subtotal = 0.0;
+        }
+        subtotal += weight[j];
+    }
+    value = total + (subtotal + 1.0) / 2.0;
+    for (l = k; l < i; l++) rank[index[l]] = value;
+    free(index);
+    return rank;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int
+makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
+{
+    int i;
+    double** data;
+    int** mask;
+
+    data = malloc(nrows*sizeof(double*));
+    if (!data) return 0;
+    mask = malloc(nrows*sizeof(int*));
+    if (!mask) {
+        free(data);
+        return 0;
+    }
+    for (i = 0; i < nrows; i++) {
+        data[i] = malloc(ncols*sizeof(double));
+        if (!data[i]) break;
+        mask[i] = malloc(ncols*sizeof(int));
+        if (!mask[i]) {
+            free(data[i]);
+            break;
+        }
+    }
+    if (i == nrows) { /* break not encountered */
+        *pdata = data;
+        *pmask = mask;
+        return 1;
+    }
+    *pdata = NULL;
+    *pmask = NULL;
+    nrows = i;
+    for (i = 0; i < nrows; i++) {
+        free(data[i]);
+        free(mask[i]);
+    }
+    free(data);
+    free(mask);
+    return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void
+freedatamask(int n, double** data, int** mask)
+{
+    int i;
+
+    for (i = 0; i < n; i++) {
+        free(mask[i]);
+        free(data[i]);
+    }
+    free(mask);
+    free(data);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static double
+find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
+/*
+This function searches the distance matrix to find the pair with the shortest
+distance between them. The indices of the pair are returned in ip and jp; the
+distance itself is returned by the function.
+
+n (input) int
+The number of elements in the distance matrix.
+
+distmatrix (input) double**
+A ragged array containing the distance matrix. The number of columns in each
+row is equal to the row index, so the first row (row 0) is empty.
+
+ip (output) int*
+A pointer to the integer that is to receive the first index of the pair with
+the shortest distance.
+
+jp (output) int*
+A pointer to the integer that is to receive the second index of the pair with
+the shortest distance.
+*/
+{
+    int i, j;
+    double temp;
+    double distance = distmatrix[1][0];
+
+    *ip = 1;
+    *jp = 0;
+    for (i = 1; i < n; i++) {
+        for (j = 0; j < i; j++) {
+            temp = distmatrix[i][j];
+            if (temp < distance) {
+                distance = temp;
+                *ip = i;
+                *jp = j;
+            }
+        }
+    }
+    return distance;
+}
+
+/* ********************************************************************* */
+
+static int
+svd(int m, int n, double** u, double w[], double** vt)
+/*
+Purpose
+=======
+
+This subroutine computes the singular value decomposition of a real m by n
+rectangular matrix, using Householder bidiagonalization followed by a variant
+of the QR algorithm, after the Algol procedure svd by Golub and Reinsch
+(Numer. Math. 14, 403-420, 1970). On return, w contains the singular values,
+u is overwritten with the left singular vectors, and vt contains the right
+singular vectors. The function returns 0 if successful, -1 if memory
+allocation fails, and a positive integer if the QR iteration fails to
+converge.
+*/
+{
+    int i, j, k, i1, k1, l1, its;
+    double c, f, h, s, x, y, z;
+    int l = 0;
+    int ierr = 0;
+    double g = 0.0;
+    double scale = 0.0;
+    double anorm = 0.0;
+    double* rv1 = malloc(n*sizeof(double));
+
+    if (!rv1) return -1;
+    if (m >= n) {
+        /* Householder reduction to bidiagonal form */
+        for (i = 0; i < n; i++) {
+            l = i + 1;
+            rv1[i] = scale * g;
+            g = 0.0;
+            s = 0.0;
+            scale = 0.0;
+            for (k = i; k < m; k++) scale += fabs(u[k][i]);
+            if (scale != 0.0) {
+                for (k = i; k < m; k++) {
+                    u[k][i] /= scale;
+                    s += u[k][i]*u[k][i];
+                }
+                f = u[i][i];
+                g = (f >= 0) ? -sqrt(s) : sqrt(s);
+                h = f * g - s;
+                u[i][i] = f - g;
+                if (i < n-1) {
+                    for (j = l; j < n; j++) {
+                        s = 0.0;
+                        for (k = i; k < m; k++) s += u[k][i] * u[k][j];
+                        f = s / h;
+                        for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+                    }
+                }
+                for (k = i; k < m; k++) u[k][i] *= scale;
+            }
+            w[i] = scale * g;
+            g = 0.0;
+            s = 0.0;
+            scale = 0.0;
+            if (i < m && i != n-1) {
+                for (k = l; k < n; k++) scale += fabs(u[i][k]);
+                if (scale != 0.0) {
+                    for (k = l; k < n; k++) {
+                        u[i][k] /= scale;
+                        s += u[i][k]*u[i][k];
+                    }
+                    f = u[i][l];
+                    g = (f >= 0) ? -sqrt(s) : sqrt(s);
+                    h = f * g - s;
+                    u[i][l] = f - g;
+                    for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+                    for (j = l; j < m; j++) {
+                        s = 0.0;
+                        for (k = l; k < n; k++) s += u[j][k] * u[i][k];
+                        for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+                    }
+                    for (k = l; k < n; k++) u[i][k] *= scale;
+                }
+            }
+            anorm = max(anorm, fabs(w[i])+fabs(rv1[i]));
+        }
+        /* accumulation of right-hand transformations */
+        for (i = n-1; i >= 0; i--) {
+            if (i < n-1) {
+                if (g != 0.0) {
+                    for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
+                    /* double division avoids possible underflow */
+                    for (j = l; j < n; j++) {
+                        s = 0.0;
+                        for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
+                        for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
+                    }
+                }
+            }
+            for (j = l; j < n; j++) {
+                vt[j][i] = 0.0;
+                vt[i][j] = 0.0;
+            }
+            vt[i][i] = 1.0;
+            g = rv1[i];
+            l = i;
+        }
+        /* accumulation of left-hand transformations */
+        for (i = n-1; i >= 0; i--) {
+            l = i + 1;
+            g = w[i];
+            if (i != n-1)
+                for (j = l; j < n; j++) u[i][j] = 0.0;
+            if (g != 0.0) {
+                if (i != n-1) {
+                    for (j = l; j < n; j++) {
+                        s = 0.0;
+                        for (k = l; k < m; k++) s += u[k][i] * u[k][j];
+                        /* double division avoids possible underflow */
+                        f = (s / u[i][i]) / g;
+                        for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+                    }
+                }
+                for (j = i; j < m; j++) u[j][i] /= g;
+            }
+            else
+                for (j = i; j < m; j++) u[j][i] = 0.0;
+            u[i][i] += 1.0;
+        }
+        /* diagonalization of the bidiagonal form */
+        for (k = n-1; k >= 0; k--) {
+            k1 = k-1;
+            its = 0;
+            while (1) {
+                /* test for splitting */
+                for (l = k; l >= 0; l--) {
+                    l1 = l-1;
+                    if (fabs(rv1[l]) + anorm == anorm) break;
+                    /* rv1[0] is always zero, so there is no exit
+                     * through the bottom of the loop */
+                    if (fabs(w[l1]) + anorm == anorm) {
+                        /* cancellation of rv1[l] if l greater than 0 */
+                        c = 0.0;
+                        s = 1.0;
+                        for (i = l; i <= k; i++) {
+                            f = s * rv1[i];
+                            rv1[i] *= c;
+                            if (fabs(f) + anorm == anorm) break;
+                            g = w[i];
+                            h = sqrt(f*f+g*g);
+                            w[i] = h;
+                            c = g / h;
+                            s = -f / h;
+                            for (j = 0; j < m; j++) {
+                                y = u[j][l1];
+                                z = u[j][i];
+                                u[j][l1] = y * c + z * s;
+                                u[j][i] = -y * s + z * c;
+                            }
+                        }
+                        break;
+                    }
+                }
+                /* test for convergence */
+                z = w[k];
+                if (l == k) { /* convergence */
+                    if (z < 0.0) {
+                        /* w[k] is made non-negative */
+                        w[k] = -z;
+                        for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
+                    }
+                    break;
+                }
+                else if (its == 30) {
+                    ierr = k;
+                    break;
+                }
+                else {
+                    /* shift from bottom 2 by 2 minor */
+                    its++;
+                    x = w[l];
+                    y = w[k1];
+                    g = rv1[k1];
+                    h = rv1[k];
+                    f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0*h*y);
+                    g = sqrt(f*f+1.0);
+                    f = ((x - z) * (x + z) +
+                         h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
+                    /* next qr transformation */
+                    c = 1.0;
+                    s = 1.0;
+                    for (i1 = l; i1 <= k1; i1++) {
+                        i = i1 + 1;
+                        g = rv1[i];
+                        y = w[i];
+                        h = s * g;
+                        g = c * g;
+                        z = sqrt(f*f+h*h);
+                        rv1[i1] = z;
+                        c = f / z;
+                        s = h / z;
+                        f = x * c + g * s;
+                        g = -x * s + g * c;
+                        h = y * s;
+                        y = y * c;
+                        for (j = 0; j < n; j++) {
+                            x = vt[i1][j];
+                            z = vt[i][j];
+                            vt[i1][j] = x * c + z * s;
+                            vt[i][j] = -x * s + z * c;
+                        }
+                        z = sqrt(f*f+h*h);
+                        w[i1] = z;
+                        /* rotation can be arbitrary if z is zero */
+                        if (z != 0.0) {
+                            c = f / z;
+                            s = h / z;
+                        }
+                        f = c * g + s * y;
+                        x = -s * g + c * y;
+                        for (j = 0; j < m; j++) {
+                            y = u[j][i1];
+                            z = u[j][i];
+                            u[j][i1] = y * c + z * s;
+                            u[j][i] = -y * s + z * c;
+                        }
+                    }
+                    rv1[l] = 0.0;
+                    rv1[k] = f;
+                    w[k] = x;
+                }
+            }
+        }
+    }
+    else /* m < n */ {
+        /* Householder reduction to bidiagonal form */
+        for (i = 0; i < m; i++) {
+            l = i + 1;
+            rv1[i] = scale * g;
+            g = 0.0;
+            s = 0.0;
+            scale = 0.0;
+            for (k = i; k < n; k++) scale += fabs(u[i][k]);
+            if (scale != 0.0) {
+                for (k = i; k < n; k++) {
+                    u[i][k] /= scale;
+                    s += u[i][k]*u[i][k];
+                }
+                f = u[i][i];
+                g = (f >= 0) ? -sqrt(s) : sqrt(s);
+                h = f * g - s;
+                u[i][i] = f - g;
+                if (i < m-1) {
+                    for (j = l; j < m; j++) {
+                        s = 0.0;
+                        for (k = i; k < n; k++) s += u[i][k] * u[j][k];
+                        f = s / h;
+                        for (k = i; k < n; k++) u[j][k] += f * u[i][k];
+                    }
+                }
+                for (k = i; k < n; k++) u[i][k] *= scale;
+            }
+            w[i] = scale * g;
+            g = 0.0;
+            s = 0.0;
+            scale = 0.0;
+            if (i < n && i != m-1) {
+                for (k = l; k < m; k++) scale += fabs(u[k][i]);
+                if (scale != 0.0) {
+                    for (k = l; k < m; k++) {
+                        u[k][i] /= scale;
+                        s += u[k][i]*u[k][i];
+                    }
+                    f = u[l][i];
+                    g = (f >= 0) ? -sqrt(s) : sqrt(s);
+                    h = f * g - s;
+                    u[l][i] = f - g;
+                    for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
+                    for (j = l; j < n; j++) {
+                        s = 0.0;
+                        for (k = l; k < m; k++) s += u[k][j] * u[k][i];
+                        for (k = l; k < m; k++) u[k][j] += s * rv1[k];
+                    }
+                    for (k = l; k < m; k++) u[k][i] *= scale;
+                }
+            }
+            anorm = max(anorm, fabs(w[i])+fabs(rv1[i]));
+        }
+        /* accumulation of right-hand transformations */
+        for (i = m-1; i >= 0; i--) {
+            if (i < m-1) {
+                if (g != 0.0) {
+                    for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
+                    /* double division avoids possible underflow */
+                    for (j = l; j < m; j++) {
+                        s = 0.0;
+                        for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
+                        for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
+                    }
+                }
+            }
+            for (j = l; j < m; j++) {
+                vt[i][j] = 0.0;
+                vt[j][i] = 0.0;
+            }
+            vt[i][i] = 1.0;
+            g = rv1[i];
+            l = i;
+        }
+        /* accumulation of left-hand transformations */
+        for (i = m-1; i >= 0; i--) {
+            l = i + 1;
+            g = w[i];
+            if (i != m-1)
+                for (j = l; j < m; j++) u[j][i] = 0.0;
+            if (g != 0.0) {
+                if (i != m-1) {
+                    for (j = l; j < m; j++) {
+                        s = 0.0;
+                        for (k = l; k < n; k++) s += u[i][k] * u[j][k];
+                        /* double division avoids possible underflow */
+                        f = (s / u[i][i]) / g;
+                        for (k = i; k < n; k++) u[j][k] += f * u[i][k];
+                    }
+                }
+                for (j = i; j < n; j++) u[i][j] /= g;
+            }
+            else
+                for (j = i; j < n; j++) u[i][j] = 0.0;
+            u[i][i] += 1.0;
+        }
+        /* diagonalization of the bidiagonal form */
+        for (k = m-1; k >= 0; k--) {
+            k1 = k-1;
+            its = 0;
+            while (1) {
+                /* test for splitting */
+                for (l = k; l >= 0; l--) {
+                    l1 = l-1;
+                    if (fabs(rv1[l]) + anorm == anorm) break;
+                    /* rv1[0] is always zero, so there is no exit
+                     * through the bottom of the loop */
+                    if (fabs(w[l1]) + anorm == anorm) {
+                        /* cancellation of rv1[l] if l greater than 0 */
+                        c = 0.0;
+                        s = 1.0;
+                        for (i = l; i <= k; i++) {
+                            f = s * rv1[i];
+                            rv1[i] *= c;
+                            if (fabs(f) + anorm == anorm) break;
+                            g = w[i];
+                            h = sqrt(f*f+g*g);
+                            w[i] = h;
+                            c = g / h;
+                            s = -f / h;
+                            for (j = 0; j < n; j++) {
+                                y = u[l1][j];
+                                z = u[i][j];
+                                u[l1][j] = y * c + z * s;
+                                u[i][j]
= -y * s + z * c; + } + } + break; + } + } + /* test for convergence */ + z = w[k]; + if (l == k) /* convergence */ { + if (z < 0.0) { + /* w[k] is made non-negative */ + w[k] = -z; + for (j = 0; j < m; j++) vt[j][k] = -vt[j][k]; + } + break; + } + else if (its == 30) { + ierr = k; + break; + } + else { + /* shift from bottom 2 by 2 minor */ + its++; + x = w[l]; + y = w[k1]; + g = rv1[k1]; + h = rv1[k]; + f = ((y - z) * (y + z) + + (g - h) * (g + h)) / (2.0 * h * y); + g = sqrt(f*f+1.0); + f = ((x - z) * (x + z) + + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x; + /* next qr transformation */ + c = 1.0; + s = 1.0; + for (i1 = l; i1 <= k1; i1++) { + i = i1 + 1; + g = rv1[i]; + y = w[i]; + h = s * g; + g = c * g; + z = sqrt(f*f+h*h); + rv1[i1] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = -x * s + g * c; + h = y * s; + y = y * c; + for (j = 0; j < m; j++) { + x = vt[j][i1]; + z = vt[j][i]; + vt[j][i1] = x * c + z * s; + vt[j][i] = -x * s + z * c; + } + z = sqrt(f*f+h*h); + w[i1] = z; + /* rotation can be arbitrary if z is zero */ + if (z != 0.0) { + c = f / z; + s = h / z; + } + f = c * g + s * y; + x = -s * g + c * y; + for (j = 0; j < n; j++) { + y = u[i1][j]; + z = u[i][j]; + u[i1][j] = y * c + z * s; + u[i][j] = -y * s + z * c; + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = x; + } + } + } + } + free(rv1); + return ierr; +} + +/* ********************************************************************* */ + +int +pca(int nrows, int ncolumns, double** u, double** v, double* w) +/* +Purpose +======= + +This subroutine uses the singular value decomposition to perform principal +components analysis of a real nrows by ncolumns rectangular matrix. + +Arguments +========= + +nrows (input) int +The number of rows in the matrix u. + +ncolumns (input) int +The number of columns in the matrix v. + +u (input) double[nrows][ncolumns] +On input, the array containing the data to which the principal component +analysis should be applied. The function assumes that the mean has already been +subtracted of each column, and hence that the mean of each column is zero. +On output, see below. + +v (input) double[n][n], where n = min(nrows, ncolumns) +Not used on input. + +w (input) double[n], where n = min(nrows, ncolumns) +Not used on input. + + +Return value +============ + +On output: + +If nrows >= ncolumns, then + +u contains the coordinates with respect to the principal components; +v contains the principal component vectors. + +The dot product u . v reproduces the data that were passed in u. + + +If nrows < ncolumns, then + +u contains the principal component vectors; +v contains the coordinates with respect to the principal components. + +The dot product v . u reproduces the data that were passed in u. + +The eigenvalues of the covariance matrix are returned in w. + +The arrays u, v, and w are sorted according to eigenvalue, with the largest +eigenvalues appearing first. + +The function returns 0 if successful, -1 if memory allocation fails, and a +positive integer if the singular value decomposition fails to converge. 
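+
+Example
+=======
+
+A minimal calling sketch for the nrows >= ncolumns case (allocation checks
+and frees omitted for brevity; all names here are illustrative only):
+
+    int i;
+    int nrows = 5, ncolumns = 3;
+    double** u = malloc(nrows*sizeof(double*));
+    double** v = malloc(ncolumns*sizeof(double*));
+    double* w = malloc(ncolumns*sizeof(double));
+    for (i = 0; i < nrows; i++) u[i] = calloc(ncolumns, sizeof(double));
+    for (i = 0; i < ncolumns; i++) v[i] = calloc(ncolumns, sizeof(double));
+    /* fill u with column-centered data, then: */
+    if (pca(nrows, ncolumns, u, v, w) == 0) {
+        /* u now holds the coordinates with respect to the principal
+         * components, v the principal component vectors, and w the
+         * eigenvalues, sorted by decreasing eigenvalue. */
+    }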
+*/ +{ + int i; + int j; + int error; + int* index = malloc(ncolumns*sizeof(int)); + double* temp = malloc(ncolumns*sizeof(double)); + + if (!index || !temp) { + if (index) free(index); + if (temp) free(temp); + return -1; + } + error = svd(nrows, ncolumns, u, w, v); + if (error == 0) { + if (nrows >= ncolumns) { + for (j = 0; j < ncolumns; j++) { + const double s = w[j]; + for (i = 0; i < nrows; i++) u[i][j] *= s; + } + sort_index(ncolumns, w, index); + for (i = 0; i < ncolumns/2; i++) { + j = index[i]; + index[i] = index[ncolumns-1-i]; + index[ncolumns-1-i] = j; + } + for (i = 0; i < nrows; i++) { + for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]]; + for (j = 0; j < ncolumns; j++) u[i][j] = temp[j]; + } + for (i = 0; i < ncolumns; i++) { + for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i]; + for (j = 0; j < ncolumns; j++) v[j][i] = temp[j]; + } + for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]]; + for (i = 0; i < ncolumns; i++) w[i] = temp[i]; + } + else /* nrows < ncolumns */ { + for (j = 0; j < nrows; j++) { + const double s = w[j]; + for (i = 0; i < nrows; i++) v[i][j] *= s; + } + sort_index(nrows, w, index); + for (i = 0; i < nrows/2; i++) { + j = index[i]; + index[i] = index[nrows-1-i]; + index[nrows-1-i] = j; + } + for (j = 0; j < ncolumns; j++) { + for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j]; + for (i = 0; i < nrows; i++) u[i][j] = temp[i]; + } + for (j = 0; j < nrows; j++) { + for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]]; + for (i = 0; i < nrows; i++) v[j][i] = temp[i]; + } + for (i = 0; i < nrows; i++) temp[i] = w[index[i]]; + for (i = 0; i < nrows; i++) w[i] = temp[i]; + } + } + free(index); + free(temp); + return error; +} + +/* ********************************************************************* */ + +static double +euclid(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) + +/* +Purpose +======= + +The euclid routine calculates the weighted Euclidean distance between two +rows or columns in a matrix. + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. 
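+
+Note that, despite its name, this routine returns the weighted mean of the
+squared element-wise differences; the square root is not taken. Pairs with a
+missing value in either vector are skipped, and the sum is normalized by the
+total weight of the pairs actually compared.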
+ +============================================================================ +*/ +{ + double result = 0.0; + double tweight = 0; + int i; + + if (transpose == 0) /* Calculate the distance between two rows */ { + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term = data1[index1][i] - data2[index2][i]; + result += weight[i]*term*term; + tweight += weight[i]; + } + } + } + else { + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term = data1[i][index1] - data2[i][index2]; + result += weight[i]*term*term; + tweight += weight[i]; + } + } + } + if (!tweight) return 0; /* usually due to empty clusters */ + result /= tweight; + return result; +} + +/* ********************************************************************* */ + +static double +cityblock(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) + +/* +Purpose +======= + +The cityblock routine calculates the weighted "City Block" distance between +two rows or columns in a matrix. City Block distance is defined as the +absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is +equivalent to taking an "up and over" path. + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. + +============================================================================ */ +{ + double result = 0.; + double tweight = 0; + int i; + + if (transpose == 0) /* Calculate the distance between two rows */ { + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term = data1[index1][i] - data2[index2][i]; + result = result + weight[i]*fabs(term); + tweight += weight[i]; + } + } + } + else { + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term = data1[i][index1] - data2[i][index2]; + result = result + weight[i]*fabs(term); + tweight += weight[i]; + } + } + } + if (!tweight) return 0; /* usually due to empty clusters */ + result /= tweight; + return result; +} + +/* ********************************************************************* */ + +static double +correlation(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The correlation routine calculates the weighted Pearson distance between two +rows or columns in a matrix. We define the Pearson distance as one minus the +Pearson correlation. 
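+In other words, d(x,y) = 1 - r(x,y), where r is the weighted Pearson
+correlation of the masked vectors x and y, so the distance ranges from 0 to 2.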
+This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. +but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold +(e.g., choose b = a + c). + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. +============================================================================ +*/ +{ + double result = 0.; + double sum1 = 0.; + double sum2 = 0.; + double denom1 = 0.; + double denom2 = 0.; + double tweight = 0.; + + if (transpose == 0) /* Calculate the distance between two rows */ { + int i; + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term1 = data1[index1][i]; + double term2 = data2[index2][i]; + double w = weight[i]; + sum1 += w*term1; + sum2 += w*term2; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + tweight += w; + } + } + } + else { + int i; + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term1 = data1[i][index1]; + double term2 = data2[i][index2]; + double w = weight[i]; + sum1 += w*term1; + sum2 += w*term2; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + tweight += w; + } + } + } + if (!tweight) return 0; /* usually due to empty clusters */ + result -= sum1 * sum2 / tweight; + denom1 -= sum1 * sum1 / tweight; + denom2 -= sum2 * sum2 / tweight; + if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ + if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ + result = result / sqrt(denom1*denom2); + result = 1. - result; + return result; +} + +/* ********************************************************************* */ + +static double +acorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The acorrelation routine calculates the weighted Pearson distance between two +rows or columns, using the absolute value of the correlation. +This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. +but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold +(e.g., choose b = a + c). + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. 
+ +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. +============================================================================ +*/ +{ + double result = 0.; + double sum1 = 0.; + double sum2 = 0.; + double denom1 = 0.; + double denom2 = 0.; + double tweight = 0.; + + if (transpose == 0) /* Calculate the distance between two rows */ { + int i; + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term1 = data1[index1][i]; + double term2 = data2[index2][i]; + double w = weight[i]; + sum1 += w*term1; + sum2 += w*term2; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + tweight += w; + } + } + } + else { + int i; + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term1 = data1[i][index1]; + double term2 = data2[i][index2]; + double w = weight[i]; + sum1 += w*term1; + sum2 += w*term2; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + tweight += w; + } + } + } + if (!tweight) return 0; /* usually due to empty clusters */ + result -= sum1 * sum2 / tweight; + denom1 -= sum1 * sum1 / tweight; + denom2 -= sum2 * sum2 / tweight; + if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ + if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ + result = fabs(result) / sqrt(denom1*denom2); + result = 1. - result; + return result; +} + +/* ********************************************************************* */ + +static double +ucorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The ucorrelation routine calculates the weighted Pearson distance between two +rows or columns, using the uncentered version of the Pearson correlation. In +the uncentered Pearson correlation, a zero mean is used for both vectors even +if the actual mean is nonzero. +This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. +but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold +(e.g., choose b = a + c). + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. 
+ +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. +============================================================================ +*/ +{ + double result = 0.; + double denom1 = 0.; + double denom2 = 0.; + int flag = 0; + + /* flag will remain zero if no nonzero combinations of mask1 and mask2 are + * found. + */ + if (transpose == 0) /* Calculate the distance between two rows */ { + int i; + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term1 = data1[index1][i]; + double term2 = data2[index2][i]; + double w = weight[i]; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + flag = 1; + } + } + } + else { + int i; + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term1 = data1[i][index1]; + double term2 = data2[i][index2]; + double w = weight[i]; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + flag = 1; + } + } + } + if (!flag) return 0.; + if (denom1 == 0.) return 1.; + if (denom2 == 0.) return 1.; + result = result / sqrt(denom1*denom2); + result = 1. - result; + return result; +} + +/* ********************************************************************* */ + +static double +uacorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The uacorrelation routine calculates the weighted Pearson distance between two +rows or columns, using the absolute value of the uncentered version of the +Pearson correlation. In the uncentered Pearson correlation, a zero mean is used +for both vectors even if the actual mean is nonzero. +This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. +but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold +(e.g., choose b = a + c). + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. 
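+
+Because the absolute value of the uncentered correlation is taken, perfectly
+correlated and perfectly anticorrelated vectors both have distance zero.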
+============================================================================ +*/ +{ + double result = 0.; + double denom1 = 0.; + double denom2 = 0.; + int flag = 0; + /* flag will remain zero if no nonzero combinations of mask1 and mask2 are + * found. + */ + + if (transpose == 0) /* Calculate the distance between two rows */ { + int i; + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + double term1 = data1[index1][i]; + double term2 = data2[index2][i]; + double w = weight[i]; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + flag = 1; + } + } + } + else { + int i; + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + double term1 = data1[i][index1]; + double term2 = data2[i][index2]; + double w = weight[i]; + result += w*term1*term2; + denom1 += w*term1*term1; + denom2 += w*term2*term2; + flag = 1; + } + } + } + if (!flag) return 0.; + if (denom1 == 0.) return 1.; + if (denom2 == 0.) return 1.; + result = fabs(result) / sqrt(denom1*denom2); + result = 1. - result; + return result; +} + +/* ********************************************************************* */ + +static double +spearman(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The spearman routine calculates the Spearman distance between two rows or +columns. The Spearman distance is defined as one minus the Spearman rank +correlation. + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. + +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. 
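+
+Elements flagged as missing in either mask are dropped before ranking, and
+tied values receive the average of the ranks they span (see getrank above).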
+============================================================================ +*/ +{ + int i; + int m = 0; + double* rank1; + double* rank2; + double result = 0.; + double denom1 = 0.; + double denom2 = 0.; + double sum1 = 0.; + double sum2 = 0.; + double totalweight = 0.; + double* tdata1; + double* tdata2; + + tdata1 = malloc(n*sizeof(double)); + if (!tdata1) return 0.0; /* Memory allocation error */ + tdata2 = malloc(n*sizeof(double)); + if (!tdata2) /* Memory allocation error */ { + free(tdata1); + return 0.0; + } + if (transpose == 0) { + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + tdata1[m] = data1[index1][i]; + tdata2[m] = data2[index2][i]; + m++; + } + } + } + else { + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + tdata1[m] = data1[i][index1]; + tdata2[m] = data2[i][index2]; + m++; + } + } + } + if (m == 0) { + free(tdata1); + free(tdata2); + return 0; + } + rank1 = getrank(m, tdata1, weight); + free(tdata1); + if (!rank1) { + free(tdata2); + return 0.0; /* Memory allocation error */ + } + rank2 = getrank(m, tdata2, weight); + free(tdata2); + if (!rank2) /* Memory allocation error */ { + free(rank1); + return 0.0; + } + for (i = 0; i < m; i++) { + const double term1 = rank1[i]; + const double term2 = rank2[i]; + const double w = weight[i]; + sum1 += term1 * w; + sum2 += term2 * w; + result += term1 * term2 * w; + denom1 += term1 * term1 * w; + denom2 += term2 * term2 * w; + totalweight += w; + } + /* Note: denom1 and denom2 cannot be calculated directly from the number + * of elements. If two elements have the same rank, the squared sum of + * their ranks will change. + */ + free(rank1); + free(rank2); + if (!totalweight) return 0; /* usually due to empty clusters */ + result -= sum1 * sum2 / totalweight; + denom1 -= sum1 * sum1 / totalweight; + denom2 -= sum2 * sum2 / totalweight; + if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ + if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ + result = result / sqrt(denom1*denom2); + result = 1. - result; + return result; +} + +/* ********************************************************************* */ + +static double +kendall(int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +/* +Purpose +======= + +The kendall routine calculates the Kendall distance between two +rows or columns. The Kendall distance is defined as one minus Kendall's tau. + +Arguments +========= + +n (input) int +The number of elements in a row or column. If transpose == 0, then n is the +number of columns; otherwise, n is the number of rows. + +data1 (input) double array +The data array containing the first vector. + +data2 (input) double array +The data array containing the second vector. + +mask1 (input) int array +This array which elements in data1 are missing. If mask1[i][j] == 0, then +data1[i][j] is missing. + +mask2 (input) int array +This array which elements in data2 are missing. If mask2[i][j] == 0, then +data2[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +index1 (input) int +Index of the first row or column. + +index2 (input) int +Index of the second row or column. 
+ +transpose (input) int +If transpose == 0, the distance between two rows in the matrix is calculated. +Otherwise, the distance between two columns in the matrix is calculated. +============================================================================ +*/ +{ + double con = 0; + double dis = 0; + double exx = 0; + double exy = 0; + int flag = 0; + /* flag will remain zero if no nonzero combinations of mask1 and mask2 are + * found. + */ + double denomx; + double denomy; + double tau; + int i, j; + + if (transpose == 0) { + for (i = 0; i < n; i++) { + if (mask1[index1][i] && mask2[index2][i]) { + for (j = 0; j < i; j++) { + if (mask1[index1][j] && mask2[index2][j]) { + const double x1 = data1[index1][i]; + const double x2 = data1[index1][j]; + const double y1 = data2[index2][i]; + const double y2 = data2[index2][j]; + const double w = weight[i] * weight[j]; + if (x1 < x2 && y1 < y2) con += w; + else if (x1 > x2 && y1 > y2) con += w; + else if (x1 < x2 && y1 > y2) dis += w; + else if (x1 > x2 && y1 < y2) dis += w; + else if (x1 == x2 && y1 != y2) exx += w; + else if (x1 != x2 && y1 == y2) exy += w; + flag = 1; + } + } + } + } + } + else { + for (i = 0; i < n; i++) { + if (mask1[i][index1] && mask2[i][index2]) { + for (j = 0; j < i; j++) { + if (mask1[j][index1] && mask2[j][index2]) { + const double x1 = data1[i][index1]; + const double x2 = data1[j][index1]; + const double y1 = data2[i][index2]; + const double y2 = data2[j][index2]; + const double w = weight[i] * weight[j]; + if (x1 < x2 && y1 < y2) con += w; + else if (x1 > x2 && y1 > y2) con += w; + else if (x1 < x2 && y1 > y2) dis += w; + else if (x1 > x2 && y1 < y2) dis += w; + else if (x1 == x2 && y1 != y2) exx += w; + else if (x1 != x2 && y1 == y2) exy += w; + flag = 1; + } + } + } + } + } + if (!flag) return 0.; + denomx = con + dis + exx; + denomy = con + dis + exy; + if (denomx == 0) return 1; + if (denomy == 0) return 1; + tau = (con-dis)/sqrt(denomx*denomy); + return 1.-tau; +} + +/* ********************************************************************* */ + +static double(*setmetric(char dist)) + (int, double**, double**, int**, int**, const double[], int, int, int) +{ + switch(dist) { + case 'e': return &euclid; + case 'b': return &cityblock; + case 'c': return &correlation; + case 'a': return &acorrelation; + case 'u': return &ucorrelation; + case 'x': return &uacorrelation; + case 's': return &spearman; + case 'k': return &kendall; + default: return &euclid; + } +} + +/* ********************************************************************* */ + +static double +uniform(void) +/* +Purpose +======= + +This routine returns a uniform random number between 0.0 and 1.0. Both 0.0 +and 1.0 are excluded. This random number generator is described in: + +Pierre l'Ecuyer +Efficient and Portable Combined Random Number Generators +Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749, 774. + +The first time this routine is called, it initializes the random number +generator using the current time. First, the current epoch time in seconds is +used as a seed for the random number generator in the C library. The first two +random numbers generated by this generator are used to initialize the random +number generator implemented in this routine. + + +Arguments +========= + +None. + + +Return value +============ + +A double-precison number between 0.0 and 1.0. 
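+
+The combined generator has a period of roughly 2.3 x 10^18; the rejection
+loop guarantees that the returned value is strictly between 0.0 and 1.0.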
+============================================================================ +*/ +{ + int z; + static const int m1 = 2147483563; + static const int m2 = 2147483399; + const double scale = 1.0/m1; + + static int s1 = 0; + static int s2 = 0; + + if (s1 == 0 || s2 == 0) { + /* initialize */ + unsigned int initseed = (unsigned int) time(0); + srand(initseed); + s1 = rand(); + s2 = rand(); + } + + do { + int k = s1/53668; + s1 = 40014*(s1-k*53668)-k*12211; + if (s1 < 0) s1+=m1; + k = s2/52774; + s2 = 40692*(s2-k*52774)-k*3791; + if (s2 < 0) s2 += m2; + z = s1-s2; + if (z < 1) z += (m1-1); + } while (z == m1); /* To avoid returning 1.0 */ + + return z*scale; +} + +/* ************************************************************************ */ + +static int +binomial(int n, double p) +/* +Purpose +======= + +This routine generates a random number between 0 and n inclusive, following +the binomial distribution with probability p and n trials. The routine is +based on the BTPE algorithm, described in: + +Voratas Kachitvichyanukul and Bruce W. Schmeiser: +Binomial Random Variate Generation +Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222. + + +Arguments +========= + +p (input) double +The probability of a single event. This probability should be less than or +equal to 0.5. + +n (input) int +The number of trials. + + +Return value +============ + +An integer drawn from a binomial distribution with parameters (p, n). + +============================================================================ +*/ +{ + const double q = 1 - p; + + if (n*p < 30.0) /* Algorithm BINV */ { + const double s = p/q; + const double a = (n+1)*s; + double r = exp(n*log(q)); /* pow() causes a crash on AIX */ + int x = 0; + double u = uniform(); + while (1) { + if (u < r) return x; + u -= r; + x++; + r *= (a/x)-s; + } + } + else /* Algorithm BTPE */ { + /* Step 0 */ + const double fm = n*p + p; + const int m = (int) fm; + const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5; + const double xm = m + 0.5; + const double xl = xm - p1; + const double xr = xm + p1; + const double c = 0.134 + 20.5/(15.3+m); + const double a = (fm-xl)/(fm-xl*p); + const double b = (xr-fm)/(xr*q); + const double lambdal = a*(1.0+0.5*a); + const double lambdar = b*(1.0+0.5*b); + const double p2 = p1*(1+2*c); + const double p3 = p2 + c/lambdal; + const double p4 = p3 + c/lambdar; + while (1) { + /* Step 1 */ + int y; + int k; + double u = uniform(); + double v = uniform(); + u *= p4; + if (u <= p1) return (int)(xm-p1*v+u); + /* Step 2 */ + if (u > p2) { + /* Step 3 */ + if (u > p3) { + /* Step 4 */ + y = (int)(xr-log(v)/lambdar); + if (y > n) continue; + /* Go to step 5 */ + v = v*(u-p3)*lambdar; + } + else { + y = (int)(xl+log(v)/lambdal); + if (y < 0) continue; + /* Go to step 5 */ + v = v*(u-p2)*lambdal; + } + } + else { + const double x = xl + (u-p1)/c; + v = v*c + 1.0 - fabs(m-x+0.5)/p1; + if (v > 1) continue; + /* Go to step 5 */ + y = (int)x; + } + /* Step 5 */ + /* Step 5.0 */ + k = abs(y-m); + if (k > 20 && k < 0.5*n*p*q-1.0) { + /* Step 5.2 */ + double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + + 0.1666666666666)/(n*p*q)+0.5); + double t = -k*k/(2*n*p*q); + double A = log(v); + if (A < t-rho) return y; + else if (A > t+rho) continue; + else { + /* Step 5.3 */ + double x1 = y+1; + double f1 = m+1; + double z = n+1-m; + double w = n-y+1; + double x2 = x1*x1; + double f2 = f1*f1; + double z2 = z*z; + double w2 = w*w; + if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w) + + (y-m)*log(w*p/(x1*q)) + + 
(13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320. + + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320. + + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320. + + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.) + continue; + return y; + } + } + else { + /* Step 5.1 */ + int i; + const double s = p/q; + const double aa = s*(n+1); + double f = 1.0; + for (i = m; i < y; f *= (aa/(++i)-s)); + for (i = y; i < m; f /= (aa/(++i)-s)); + if (v > f) continue; + return y; + } + } + } + return -1; +} + +/* ************************************************************************ */ + +static void +randomassign(int nclusters, int nelements, int clusterid[]) +/* +Purpose +======= + +The randomassign routine performs an initial random clustering, needed for +k-means or k-median clustering. Elements (genes or samples) are randomly +assigned to clusters. The number of elements in each cluster is chosen +randomly, making sure that each cluster will receive at least one element. + + +Arguments +========= + +nclusters (input) int +The number of clusters. + +nelements (input) int +The number of elements to be clustered (i.e., the number of genes or samples +to be clustered). + +clusterid (output) int[nelements] +The cluster number to which an element was assigned. + +============================================================================ +*/ +{ + int i, j; + int k = 0; + double p; + int n = nelements-nclusters; + + /* Draw the number of elements in each cluster from a multinomial + * distribution, reserving ncluster elements to set independently + * in order to guarantee that none of the clusters are empty. + */ + for (i = 0; i < nclusters-1; i++) { + p = 1.0/(nclusters-i); + j = binomial(n, p); + n -= j; + j += k+1; /* Assign at least one element to cluster i */ + for ( ; k < j; k++) clusterid[k] = i; + } + /* Assign the remaining elements to the last cluster */ + for ( ; k < nelements; k++) clusterid[k] = i; + + /* Create a random permutation of the cluster assignments */ + for (i = 0; i < nelements; i++) { + j = (int) (i + (nelements-i)*uniform()); + k = clusterid[j]; + clusterid[j] = clusterid[i]; + clusterid[i] = k; + } +} + +/* ********************************************************************* */ + +static void +getclustermeans(int nclusters, int nrows, int ncolumns, + double** data, int** mask, int clusterid[], double** cdata, int** cmask, + int transpose) +/* +Purpose +======= + +The getclustermeans routine calculates the cluster centroids, given to which +cluster each element belongs. The centroid is defined as the mean over all +elements for each dimension. + +Arguments +========= + +nclusters (input) int +The number of clusters. + +nrows (input) int +The number of rows in the gene expression data matrix, equal to the number of +genes. + +ncolumns (input) int +The number of columns in the gene expression data matrix, equal to the number +of samples. + +data (input) double[nrows][ncolumns] +The array containing the gene expression data. + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If mask[i][j] == 0, then +data[i][j] is missing. + +clusterid (output) int[nrows] if transpose == 0 + int[ncolumns] otherwise +The cluster number to which each element belongs. If transpose == 0, then the +dimension of clusterid is equal to nrows (the number of genes). Otherwise, it +is equal to ncolumns (the number of samples). 
+ +cdata (output) double[nclusters][ncolumns] if transpose == 0 + double[nrows][nclusters] otherwise +On exit of getclustermeans, this array contains the cluster centroids. + +cmask (output) int[nclusters][ncolumns] if transpose == 0 + int[nrows][nclusters] otherwise +This array shows which data values of are missing for each centroid. If +cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for a +centroid if all corresponding data values of the cluster members are missing. + +transpose (input) int +If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters +of columns (samples) are specified. + +======================================================================== +*/ +{ + int i, j, k; + + if (transpose == 0) { + for (i = 0; i < nclusters; i++) { + for (j = 0; j < ncolumns; j++) { + cmask[i][j] = 0; + cdata[i][j] = 0.; + } + } + for (k = 0; k < nrows; k++) { + i = clusterid[k]; + for (j = 0; j < ncolumns; j++) { + if (mask[k][j] != 0) { + cdata[i][j] += data[k][j]; + cmask[i][j]++; + } + } + } + for (i = 0; i < nclusters; i++) { + for (j = 0; j < ncolumns; j++) { + if (cmask[i][j]>0) { + cdata[i][j] /= cmask[i][j]; + cmask[i][j] = 1; + } + } + } + } + else { + for (i = 0; i < nrows; i++) { + for (j = 0; j < nclusters; j++) { + cdata[i][j] = 0.; + cmask[i][j] = 0; + } + } + for (k = 0; k < ncolumns; k++) { + i = clusterid[k]; + for (j = 0; j < nrows; j++) { + if (mask[j][k] != 0) { + cdata[j][i] += data[j][k]; + cmask[j][i]++; + } + } + } + for (i = 0; i < nrows; i++) { + for (j = 0; j < nclusters; j++) { + if (cmask[i][j]>0) { + cdata[i][j] /= cmask[i][j]; + cmask[i][j] = 1; + } + } + } + } +} + +/* ********************************************************************* */ + +static void +getclustermedians(int nclusters, int nrows, int ncolumns, + double** data, int** mask, int clusterid[], double** cdata, int** cmask, + int transpose, double cache[]) +/* +Purpose +======= + +The getclustermedians routine calculates the cluster centroids, given to which +cluster each element belongs. The centroid is defined as the median over all +elements for each dimension. + +Arguments +========= + +nclusters (input) int +The number of clusters. + +nrows (input) int +The number of rows in the gene expression data matrix, equal to the number of +genes. + +ncolumns (input) int +The number of columns in the gene expression data matrix, equal to the number +of samples. + +data (input) double[nrows][ncolumns] +The array containing the gene expression data. + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If mask[i][j] == 0, then +data[i][j] is missing. + +clusterid (output) int[nrows] if transpose == 0 + int[ncolumns] otherwise +The cluster number to which each element belongs. If transpose == 0, then the +dimension of clusterid is equal to nrows (the number of genes). Otherwise, it +is equal to ncolumns (the number of samples). + +cdata (output) double[nclusters][ncolumns] if transpose == 0 + double[nrows][nclusters] otherwise +On exit of getclustermedians, this array contains the cluster centroids. + +cmask (output) int[nclusters][ncolumns] if transpose == 0 + int[nrows][nclusters] otherwise +This array shows which data values of are missing for each centroid. If +cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for +a centroid if all corresponding data values of the cluster members are missing. + +transpose (input) int +If transpose == 0, clusters of rows (genes) are specified. 
Otherwise, clusters +of columns (samples) are specified. + +cache (input) double[nrows] if transpose == 0 + double[ncolumns] otherwise +This array should be allocated before calling getclustermedians; its contents +on input is not relevant. This array is used as a temporary storage space when +calculating the medians. + +======================================================================== +*/ +{ + int i, j, k; + + if (transpose == 0) { + for (i = 0; i < nclusters; i++) { + for (j = 0; j < ncolumns; j++) { + int count = 0; + for (k = 0; k < nrows; k++) { + if (i == clusterid[k] && mask[k][j]) { + cache[count] = data[k][j]; + count++; + } + } + if (count>0) { + cdata[i][j] = median(count, cache); + cmask[i][j] = 1; + } + else { + cdata[i][j] = 0.; + cmask[i][j] = 0; + } + } + } + } + else { + for (i = 0; i < nclusters; i++) { + for (j = 0; j < nrows; j++) { + int count = 0; + for (k = 0; k < ncolumns; k++) { + if (i == clusterid[k] && mask[j][k]) { + cache[count] = data[j][k]; + count++; + } + } + if (count>0) { + cdata[j][i] = median(count, cache); + cmask[j][i] = 1; + } + else { + cdata[j][i] = 0.; + cmask[j][i] = 0; + } + } + } + } +} + +/* ********************************************************************* */ + +int +getclustercentroids(int nclusters, int nrows, int ncolumns, + double** data, int** mask, int clusterid[], double** cdata, int** cmask, + int transpose, char method) +/* +Purpose +======= + +The getclustercentroids routine calculates the cluster centroids, given to +which cluster each element belongs. Depending on the argument method, the +centroid is defined as either the mean or the median for each dimension over +all elements belonging to a cluster. + +Arguments +========= + +nclusters (input) int +The number of clusters. + +nrows (input) int +The number of rows in the gene expression data matrix, equal to the number of +genes. + +ncolumns (input) int +The number of columns in the gene expression data matrix, equal to the number +of samples. + +data (input) double[nrows][ncolumns] +The array containing the gene expression data. + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If mask[i][j] == 0, then +data[i][j] is missing. + +clusterid (output) int[nrows] if transpose == 0 + int[ncolumns] otherwise +The cluster number to which each element belongs. If transpose == 0, then the +dimension of clusterid is equal to nrows (the number of genes). Otherwise, it +is equal to ncolumns (the number of samples). + +cdata (output) double[nclusters][ncolumns] if transpose == 0 + double[nrows][nclusters] otherwise +On exit of getclustercentroids, this array contains the cluster centroids. + +cmask (output) int[nclusters][ncolumns] if transpose == 0 + int[nrows][nclusters] otherwise +This array shows which data values of are missing for each centroid. If +cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for +a centroid if all corresponding data values of the cluster members are missing. + +transpose (input) int +If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters +of columns (samples) are specified. + +method (input) char +For method == 'a', the centroid is defined as the mean over all elements +belonging to a cluster for each dimension. +For method == 'm', the centroid is defined as the median over all elements +belonging to a cluster for each dimension. + +Return value +============ + +The function returns an integer to indicate success or failure. 
If a +memory error occurs, or if method is not 'm' or 'a', getclustercentroids +returns 0. If successful, getclustercentroids returns 1. +======================================================================== +*/ +{ + switch(method) { + case 'm': { + const int nelements = (transpose == 0) ? nrows : ncolumns; + double* cache = malloc(nelements*sizeof(double)); + if (!cache) return 0; + getclustermedians(nclusters, nrows, ncolumns, data, mask, + clusterid, cdata, cmask, transpose, cache); + free(cache); + return 1; + } + case 'a': { + getclustermeans(nclusters, nrows, ncolumns, data, mask, + clusterid, cdata, cmask, transpose); + return 1; + } + } + return 0; +} + +/* ********************************************************************* */ + +void +getclustermedoids(int nclusters, int nelements, double** distance, + int clusterid[], int centroids[], double errors[]) +/* +Purpose +======= + +The getclustermedoids routine calculates the cluster centroids, given to which +cluster each element belongs. The centroid is defined as the element with the +smallest sum of distances to the other elements. + +Arguments +========= + +nclusters (input) int +The number of clusters. + +nelements (input) int +The total number of elements. + +distmatrix (input) double array, ragged + (number of rows is nelements, number of columns is equal to the row number) +The distance matrix. To save space, the distance matrix is given in the +form of a ragged array. The distance matrix is symmetric and has zeros +on the diagonal. See distancematrix for a description of the content. + +clusterid (output) int[nelements] +The cluster number to which each element belongs. + +centroid (output) int[nclusters] +The index of the element that functions as the centroid for each cluster. + +errors (output) double[nclusters] +The within-cluster sum of distances between the items and the cluster +centroid. + +======================================================================== +*/ +{ + int i, j, k; + + for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX; + for (i = 0; i < nelements; i++) { + double d = 0.0; + j = clusterid[i]; + for (k = 0; k < nelements; k++) { + if (i == k || clusterid[k]!=j) continue; + d += (i < k ? distance[k][i] : distance[i][k]); + if (d > errors[j]) break; + } + if (d < errors[j]) { + errors[j] = d; + centroids[j] = i; + } + } +} + +/* ********************************************************************* */ + +static int +kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask, + double weight[], int transpose, int npass, char dist, + double** cdata, int** cmask, int clusterid[], double* error, + int tclusterid[], int counts[], int mapping[]) +{ + int i, j, k; + const int nelements = (transpose == 0) ? nrows : ncolumns; + const int ndata = (transpose == 0) ? ncolumns : nrows; + int ifound = 1; + int ipass = 0; + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + /* Save the clustering solution periodically and check if it reappears */ + int* saved = malloc(nelements*sizeof(int)); + if (saved == NULL) return -1; + + *error = DBL_MAX; + + do { + double total = DBL_MAX; + int counter = 0; + int period = 10; + + /* Perform the EM algorithm. + * First, randomly assign elements to clusters. 
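+         * Then iterate: recompute the cluster centroids, reassign each
+         * element to its nearest centroid (skipping moves that would empty a
+         * cluster), and stop when the total within-cluster distance stops
+         * decreasing or a previously saved assignment reappears.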
*/ + if (npass != 0) randomassign(nclusters, nelements, tclusterid); + + for (i = 0; i < nclusters; i++) counts[i] = 0; + for (i = 0; i < nelements; i++) counts[tclusterid[i]]++; + + /* Start the loop */ + while (1) { + double previous = total; + total = 0.0; + + if (counter % period == 0) { + /* Save the current cluster assignments */ + for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; + if (period < INT_MAX / 2) period *= 2; + } + counter++; + + /* Find the center */ + getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid, + cdata, cmask, transpose); + + for (i = 0; i < nelements; i++) { + double distance; + /* Calculate the distances */ + k = tclusterid[i]; + if (counts[k] == 1) continue; + /* No reassignment if that would lead to an empty cluster */ + /* Treat the present cluster as a special case */ + distance = metric(ndata, data, cdata, mask, cmask, weight, + i, k, transpose); + for (j = 0; j < nclusters; j++) { + double tdistance; + if (j == k) continue; + tdistance = metric(ndata, data, cdata, mask, cmask, weight, + i, j, transpose); + if (tdistance < distance) { + distance = tdistance; + counts[tclusterid[i]]--; + tclusterid[i] = j; + counts[j]++; + } + } + total += distance; + } + if (total >= previous) break; + /* total >= previous is FALSE on some machines even if total and + * previous are bitwise identical. */ + for (i = 0; i < nelements; i++) + if (saved[i]!=tclusterid[i]) break; + if (i == nelements) + break; /* Identical solution found; break out of this loop */ + } + + if (npass <= 1) { + *error = total; + break; + } + + for (i = 0; i < nclusters; i++) mapping[i] = -1; + for (i = 0; i < nelements; i++) { + j = tclusterid[i]; + k = clusterid[i]; + if (mapping[k] == -1) mapping[k] = j; + else if (mapping[k] != j) { + if (total < *error) { + ifound = 1; + *error = total; + for (j = 0; j < nelements; j++) + clusterid[j] = tclusterid[j]; + } + break; + } + } + if (i == nelements) ifound++; /* break statement not encountered */ + } while (++ipass < npass); + + free(saved); + return ifound; +} + +/* ---------------------------------------------------------------------- */ + +static int +kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask, + double weight[], int transpose, int npass, char dist, + double** cdata, int** cmask, int clusterid[], double* error, + int tclusterid[], int counts[], int mapping[], double cache[]) +{ + int i, j, k; + const int nelements = (transpose == 0) ? nrows : ncolumns; + const int ndata = (transpose == 0) ? ncolumns : nrows; + int ifound = 1; + int ipass = 0; + int* saved; + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + /* Save the clustering solution periodically and check if it reappears */ + saved = malloc(nelements*sizeof(int)); + if (saved == NULL) return -1; + + *error = DBL_MAX; + + do { + double total = DBL_MAX; + int counter = 0; + int period = 10; + + /* Perform the EM algorithm. + * First, randomly assign elements to clusters. 
*/ + if (npass != 0) randomassign(nclusters, nelements, tclusterid); + + for (i = 0; i < nclusters; i++) counts[i] = 0; + for (i = 0; i < nelements; i++) counts[tclusterid[i]]++; + + /* Start the loop */ + while (1) { + double previous = total; + total = 0.0; + + if (counter % period == 0) { + /* Save the current cluster assignments */ + for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; + if (period < INT_MAX / 2) period *= 2; + } + counter++; + + /* Find the center */ + getclustermedians(nclusters, nrows, ncolumns, data, mask, + tclusterid, cdata, cmask, transpose, cache); + + for (i = 0; i < nelements; i++) { + /* Calculate the distances */ + double distance; + k = tclusterid[i]; + if (counts[k] == 1) continue; + /* No reassignment if that would lead to an empty cluster */ + /* Treat the present cluster as a special case */ + distance = metric(ndata, data, cdata, mask, cmask, weight, + i, k, transpose); + for (j = 0; j < nclusters; j++) { + double tdistance; + if (j == k) continue; + tdistance = metric(ndata, data, cdata, mask, cmask, weight, + i, j, transpose); + if (tdistance < distance) { + distance = tdistance; + counts[tclusterid[i]]--; + tclusterid[i] = j; + counts[j]++; + } + } + total += distance; + } + if (total >= previous) break; + /* total >= previous is FALSE on some machines even if total and + * previous are bitwise identical. */ + for (i = 0; i < nelements; i++) + if (saved[i]!=tclusterid[i]) break; + if (i == nelements) + break; /* Identical solution found; break out of this loop */ + } + + if (npass <= 1) { + *error = total; + break; + } + + for (i = 0; i < nclusters; i++) mapping[i] = -1; + for (i = 0; i < nelements; i++) { + j = tclusterid[i]; + k = clusterid[i]; + if (mapping[k] == -1) mapping[k] = j; + else if (mapping[k] != j) { + if (total < *error) { + ifound = 1; + *error = total; + for (j = 0; j < nelements; j++) + clusterid[j] = tclusterid[j]; + } + break; + } + } + if (i == nelements) ifound++; /* break statement not encountered */ + } while (++ipass < npass); + + free(saved); + return ifound; +} + +/* ********************************************************************* */ + +void +kcluster(int nclusters, int nrows, int ncolumns, double** data, int** mask, + double weight[], int transpose, int npass, char method, char dist, + int clusterid[], double* error, int* ifound) +/* +Purpose +======= + +The kcluster routine performs k-means or k-median clustering on a given set of +elements, using the specified distance measure. The number of clusters is given +by the user. Multiple passes are being made to find the optimal clustering +solution, each time starting from a different initial clustering. + + +Arguments +========= + +nclusters (input) int +The number of clusters to be found. + +data (input) double[nrows][ncolumns] +The array containing the data of the elements to be clustered (i.e., the gene +expression data). + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If +mask[i][j] == 0, then data[i][j] is missing. + +nrows (input) int +The number of rows in the data matrix, equal to the number of genes. + +ncolumns (input) int +The number of columns in the data matrix, equal to the number of samples. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. 
+ +transpose (input) int +If transpose == 0, the rows of the matrix are clustered. Otherwise, columns +of the matrix are clustered. + +npass (input) int +The number of times clustering is performed. Clustering is performed npass +times, each time starting from a different (random) initial assignment of +genes to clusters. The clustering solution with the lowest within-cluster sum +of distances is chosen. +If npass == 0, then the clustering algorithm will be run once, where the +initial assignment of elements to clusters is taken from the clusterid array. + +method (input) char +Defines whether the arithmetic mean (method == 'a') or the median +(method == 'm') is used to calculate the cluster center. + +dist (input) char +Defines which distance measure is used, as given by the table: +dist == 'e': Euclidean distance +dist == 'b': City-block distance +dist == 'c': correlation +dist == 'a': absolute value of the correlation +dist == 'u': uncentered correlation +dist == 'x': absolute uncentered correlation +dist == 's': Spearman's rank correlation +dist == 'k': Kendall's tau +For other values of dist, the default (Euclidean distance) is used. + +clusterid (output; input) int[nrows] if transpose == 0 + int[ncolumns] otherwise +The cluster number to which a gene or microarray was assigned. If npass == 0, +then on input clusterid contains the initial clustering assignment from which +the clustering algorithm starts. On output, it contains the clustering solution +that was found. + +error (output) double* +The sum of distances to the cluster center of each item in the optimal k-means +clustering solution that was found. + +ifound (output) int* +The number of times the optimal clustering solution was +found. The value of ifound is at least 1; its maximum value is npass. If the +number of clusters is larger than the number of elements being clustered, +*ifound is set to 0 as an error code. If a memory allocation error occurs, +*ifound is set to -1. + +======================================================================== +*/ +{ + const int nelements = (transpose == 0) ? nrows : ncolumns; + const int ndata = (transpose == 0) ? ncolumns : nrows; + + int i; + int ok; + int* tclusterid; + int* mapping = NULL; + double** cdata; + int** cmask; + int* counts; + + if (nelements < nclusters) { + *ifound = 0; + return; + } + /* More clusters asked for than elements available */ + + *ifound = -1; + + /* This will contain the number of elements in each cluster, which is + * needed to check for empty clusters. 
*/ + counts = malloc(nclusters*sizeof(int)); + if (!counts) return; + + /* Find out if the user specified an initial clustering */ + if (npass <= 1) tclusterid = clusterid; + else { + tclusterid = malloc(nelements*sizeof(int)); + if (!tclusterid) { + free(counts); + return; + } + mapping = malloc(nclusters*sizeof(int)); + if (!mapping) { + free(counts); + free(tclusterid); + return; + } + for (i = 0; i < nelements; i++) clusterid[i] = 0; + } + + /* Allocate space to store the centroid data */ + if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask); + else ok = makedatamask(ndata, nclusters, &cdata, &cmask); + if (!ok) { + free(counts); + if (npass>1) { + free(tclusterid); + free(mapping); + } + return; + } + + if (method == 'm') { + double* cache = malloc(nelements*sizeof(double)); + if (cache) { + *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight, + transpose, npass, dist, cdata, cmask, clusterid, + error, tclusterid, counts, mapping, cache); + free(cache); + } + } + else + *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight, + transpose, npass, dist, cdata, cmask, clusterid, + error, tclusterid, counts, mapping); + + /* Deallocate temporarily used space */ + if (npass > 1) { + free(mapping); + free(tclusterid); + } + + if (transpose == 0) freedatamask(nclusters, cdata, cmask); + else freedatamask(ndata, cdata, cmask); + + free(counts); +} + +/* *********************************************************************** */ + +void +kmedoids(int nclusters, int nelements, double** distmatrix, int npass, + int clusterid[], double* error, int* ifound) +/* +Purpose +======= + +The kmedoids routine performs k-medoids clustering on a given set of elements, +using the distance matrix and the number of clusters passed by the user. +Multiple passes are being made to find the optimal clustering solution, each +time starting from a different initial clustering. + + +Arguments +========= + +nclusters (input) int +The number of clusters to be found. + +nelements (input) int +The number of elements to be clustered. + +distmatrix (input) double array, ragged + (number of rows is nelements, number of columns is equal to the row number) +The distance matrix. To save space, the distance matrix is given in the +form of a ragged array. The distance matrix is symmetric and has zeros +on the diagonal. See distancematrix for a description of the content. + +npass (input) int +The number of times clustering is performed. Clustering is performed npass +times, each time starting from a different (random) initial assignment of genes +to clusters. The clustering solution with the lowest within-cluster sum of +distances is chosen. +If npass == 0, then the clustering algorithm will be run once, where the +initial assignment of elements to clusters is taken from the clusterid array. + +clusterid (output; input) int[nelements] +On input, if npass == 0, then clusterid contains the initial clustering +assignment from which the clustering algorithm starts; all numbers in clusterid +should be between zero and nelements-1 inclusive. If npass != 0, clusterid is +ignored on input. +On output, clusterid contains the clustering solution that was found: clusterid +contains the number of the cluster to which each item was assigned. On output, +the number of a cluster is defined as the item number of the centroid of the +cluster. + +error (output) double +The sum of distances to the cluster center of each item in the optimal +k-medoids clustering solution that was found. 
+ +ifound (output) int +If kmedoids is successful: the number of times the optimal clustering solution +was found. The value of ifound is at least 1; its maximum value is npass. +If the user requested more clusters than elements available, ifound is set +to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1. + +======================================================================== +*/ +{ + int i, j, icluster; + int* tclusterid; + int* saved; + int* centroids; + double* errors; + int ipass = 0; + + if (nelements < nclusters) { + *ifound = 0; + return; + } /* More clusters asked for than elements available */ + + *ifound = -1; + + /* Save the clustering solution periodically and check if it reappears */ + saved = malloc(nelements*sizeof(int)); + if (saved == NULL) return; + + centroids = malloc(nclusters*sizeof(int)); + if (!centroids) { + free(saved); + return; + } + + errors = malloc(nclusters*sizeof(double)); + if (!errors) { + free(saved); + free(centroids); + return; + } + + /* Find out if the user specified an initial clustering */ + if (npass <= 1) tclusterid = clusterid; + else { + tclusterid = malloc(nelements*sizeof(int)); + if (!tclusterid) { + free(saved); + free(centroids); + free(errors); + return; + } + for (i = 0; i < nelements; i++) clusterid[i] = -1; + } + + *error = DBL_MAX; + do /* Start the loop */ { + double total = DBL_MAX; + int counter = 0; + int period = 10; + + if (npass != 0) randomassign(nclusters, nelements, tclusterid); + while (1) { + double previous = total; + total = 0.0; + + if (counter % period == 0) { + /* Save the current cluster assignments */ + for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; + if (period < INT_MAX / 2) period *= 2; + } + counter++; + + /* Find the center */ + getclustermedoids(nclusters, nelements, distmatrix, tclusterid, + centroids, errors); + + for (i = 0; i < nelements; i++) { + /* Find the closest cluster */ + double distance = DBL_MAX; + for (icluster = 0; icluster < nclusters; icluster++) { + double tdistance; + j = centroids[icluster]; + if (i == j) { + distance = 0.0; + tclusterid[i] = icluster; + break; + } + tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i]; + if (tdistance < distance) { + distance = tdistance; + tclusterid[i] = icluster; + } + } + total += distance; + } + if (total >= previous) break; + /* total >= previous is FALSE on some machines even if total and + * previous are bitwise identical. */ + for (i = 0; i < nelements; i++) + if (saved[i] != tclusterid[i]) break; + if (i == nelements) + break; /* Identical solution found; break out of this loop */ + } + + if (npass <= 1) { + *ifound = 1; + *error = total; + /* Replace by the centroid in each cluster. */ + for (j = 0; j < nelements; j++) { + clusterid[j] = centroids[tclusterid[j]]; + } + break; + } + + for (i = 0; i < nelements; i++) { + if (clusterid[i]!=centroids[tclusterid[i]]) { + if (total < *error) { + *ifound = 1; + *error = total; + /* Replace by the centroid in each cluster. 
*/
+                    for (j = 0; j < nelements; j++) {
+                        clusterid[j] = centroids[tclusterid[j]];
+                    }
+                }
+                break;
+            }
+        }
+        if (i == nelements) (*ifound)++; /* break statement not encountered */
+    } while (++ipass < npass);
+
+    /* Deallocate temporarily used space */
+    if (npass > 1) free(tclusterid);
+
+    free(saved);
+    free(centroids);
+    free(errors);
+}
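+
+/* Usage sketch (editorial example, not part of the original library): one way
+ * to call kcluster for k-means (method 'a', Euclidean distance 'e') on a tiny
+ * 4x2 matrix with two obvious groups. The data, weights, and parameter
+ * choices are made up for illustration.
+ */
+#if 0
+static void example_kcluster(void)
+{
+    double r0[] = {1.0, 1.0}, r1[] = {1.2, 0.8},
+           r2[] = {8.0, 8.2}, r3[] = {7.9, 8.1};
+    double* data[4] = {r0, r1, r2, r3};
+    int m0[] = {1, 1}, m1[] = {1, 1}, m2[] = {1, 1}, m3[] = {1, 1};
+    int* mask[4] = {m0, m1, m2, m3};
+    double weight[2] = {1.0, 1.0};
+    int clusterid[4];
+    double error;
+    int ifound;
+
+    kcluster(2, 4, 2, data, mask, weight, 0, 10, 'a', 'e',
+             clusterid, &error, &ifound);
+    /* ifound == -1 signals a memory error; ifound == 0 means more clusters
+     * were requested than elements; otherwise clusterid[] holds the best
+     * solution found over the 10 passes and error its sum of distances. */
+}
+#endif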
+
+/* ******************************************************************** */
+
+void
+distancematrix(int nrows, int ncolumns, double** data, int** mask,
+    double weights[], char dist, int transpose, double** matrix)
+/*
+Purpose
+=======
+
+The distancematrix routine calculates the distance matrix between genes or
+samples using their measured gene expression data. Several distance measures
+can be used. As the distance matrix is symmetric, with zeros on the diagonal,
+only the lower triangular half of the distance matrix is stored.
+Space for the distance matrix should be allocated before calling this routine.
+If the parameter transpose is set to a nonzero value, the distances between
+columns of the data matrix are calculated, otherwise distances between the rows
+are calculated.
+
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the gene expression data matrix (i.e., the number of
+genes)
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix (i.e., the number of
+samples)
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+weights (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+transpose (input) int
+If transpose is equal to zero, the distances between the rows are
+calculated. Otherwise, the distances between the columns are calculated.
+The former is needed when genes are being clustered; the latter is used
+when samples are being clustered.
+
+matrix (output) double**
+A ragged array, with the number of columns in each row equal to the
+row index (so matrix[i] has i columns). Upon return, the values of
+the distance matrix are stored in this array.
+
+
+========================================================================
+*/
+{
+    /* First determine the size of the distance matrix */
+    const int n = (transpose == 0) ? nrows : ncolumns;
+    const int ndata = (transpose == 0) ? ncolumns : nrows;
+    int i, j;
+
+    /* Set the metric function as indicated by dist */
+    double (*metric) (int, double**, double**, int**, int**,
+        const double[], int, int, int) = setmetric(dist);
+
+    /* Calculate the distances and save them in the ragged array */
+    for (i = 1; i < n; i++)
+        for (j = 0; j < i; j++)
+            matrix[i][j] = metric(ndata, data, data, mask, mask, weights,
+                                  i, j, transpose);
+}
+
+/* ******************************************************************** */
+
+double*
+calculate_weights(int nrows, int ncolumns, double** data, int** mask,
+    double weights[], int transpose, char dist, double cutoff, double exponent)
+
+/*
+Purpose
+=======
+
+This function calculates the weights using the weighting scheme proposed by
+Michael Eisen:
+w[i] = 1.0 / sum_{j where d[i][j] < cutoff} (1 - d[i][j]/cutoff)**exponent
+where cutoff and exponent are specified by the user.
+
+[The remainder of this comment, the body of calculate_weights, and the
+opening of the cuttree routine are missing from this copy of the source;
+the code below is the tail of cuttree's iterative tree traversal, which
+assigns a cluster number to each element.]
+*/
+        if (i >= 0) {
+            clusterid[i] = k;
+            j = i;
+            i = previous;
+            previous = j;
+        }
+        else {
+            j = -i-1;
+            if (previous == tree[j].left) {
+                previous = i;
+                i = tree[j].right;
+                if (j >= n && (i >= 0 || -i-1 < n)) k++;
+            }
+            else if (previous == tree[j].right) {
+                previous = i;
+                i = parents[j];
+                if (i == nelements) break;
+            }
+            else {
+                parents[j] = previous;
+                previous = i;
+                i = tree[j].left;
+                if (j >= n && (i >= 0 || -i-1 < n)) k++;
+            }
+        }
+    }
+    free(parents);
+    return 1;
+}
+
+/* ******************************************************************** */
+
+static Node*
+pclcluster(int nrows, int ncolumns, double** data, int** mask, double weight[],
+    double** distmatrix, char dist, int transpose)
+
+/*
+
+Purpose
+=======
+
+The pclcluster routine performs clustering using pairwise centroid-linking on a
+given set of gene expression data, using the distance metric given by dist.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
+of the matrix are clustered.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+distmatrix (input) double**
+The distance matrix. This matrix is precalculated by the calling routine
+treecluster. The pclcluster routine modifies the contents of distmatrix, but
+does not deallocate it.
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending
+on whether genes (rows) or samples (columns) were clustered, nelements is
+equal to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, pclcluster returns NULL. +======================================================================== +*/ +{ + int i, j; + const int nelements = (transpose == 0) ? nrows : ncolumns; + int inode; + const int ndata = transpose ? nrows : ncolumns; + const int nnodes = nelements - 1; + Node* result; + double** newdata; + int** newmask; + int* distid; + + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + distid = malloc(nelements*sizeof(int)); + if (!distid) return NULL; + result = malloc(nnodes*sizeof(Node)); + if (!result) { + free(distid); + return NULL; + } + if (!makedatamask(nelements, ndata, &newdata, &newmask)) { + free(result); + free(distid); + return NULL; + } + + for (i = 0; i < nelements; i++) distid[i] = i; + /* To remember which row/column in the distance matrix contains what */ + + /* Storage for node data */ + if (transpose) { + for (i = 0; i < nelements; i++) { + for (j = 0; j < ndata; j++) { + newdata[i][j] = data[j][i]; + newmask[i][j] = mask[j][i]; + } + } + data = newdata; + mask = newmask; + } + else { + for (i = 0; i < nelements; i++) { + memcpy(newdata[i], data[i], ndata*sizeof(double)); + memcpy(newmask[i], mask[i], ndata*sizeof(int)); + } + data = newdata; + mask = newmask; + } + + for (inode = 0; inode < nnodes; inode++) { + /* Find the pair with the shortest distance */ + int is = 1; + int js = 0; + result[inode].distance = find_closest_pair(nelements-inode, distmatrix, + &is, &js); + result[inode].left = distid[js]; + result[inode].right = distid[is]; + + /* Make node js the new node */ + for (i = 0; i < ndata; i++) { + data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i]; + mask[js][i] += mask[is][i]; + if (mask[js][i]) data[js][i] /= mask[js][i]; + } + free(data[is]); + free(mask[is]); + data[is] = data[nnodes-inode]; + mask[is] = mask[nnodes-inode]; + + /* Fix the distances */ + distid[is] = distid[nnodes-inode]; + for (i = 0; i < is; i++) + distmatrix[is][i] = distmatrix[nnodes-inode][i]; + for (i = is + 1; i < nnodes-inode; i++) + distmatrix[i][is] = distmatrix[nnodes-inode][i]; + + distid[js] = -inode-1; + for (i = 0; i < js; i++) + distmatrix[js][i] = metric(ndata, data, data, mask, mask, weight, + js, i, 0); + for (i = js + 1; i < nnodes-inode; i++) + distmatrix[i][js] = metric(ndata, data, data, mask, mask, weight, + js, i, 0); + } + + /* Free temporarily allocated space */ + free(data[0]); + free(mask[0]); + free(data); + free(mask); + free(distid); + + return result; +} + +/* ******************************************************************** */ + +static int +nodecompare(const void* a, const void* b) +/* Helper function for qsort. */ +{ + const Node* node1 = (const Node*)a; + const Node* node2 = (const Node*)b; + const double term1 = node1->distance; + const double term2 = node2->distance; + + if (term1 < term2) return -1; + if (term1 > term2) return +1; + return 0; +} + +/* ---------------------------------------------------------------------- */ + +static Node* +pslcluster(int nrows, int ncolumns, double** data, int** mask, + double weight[], double** distmatrix, char dist, int transpose) + +/* + +Purpose +======= + +The pslcluster routine performs single-linkage hierarchical clustering, using +either the distance matrix directly, if available, or by calculating the +distances from the data array. This implementation is based on the SLINK +algorithm, described in: +Sibson, R. (1973). 
SLINK: An optimally efficient algorithm for the single-link +cluster method. The Computer Journal, 16(1): 30-34. +The output of this algorithm is identical to conventional single-linkage +hierarchical clustering, but is much more memory-efficient and faster. Hence, +it can be applied to large data sets, for which the conventional single- +linkage algorithm fails due to lack of memory. + + +Arguments +========= + +nrows (input) int +The number of rows in the gene expression data matrix, equal to the number of +genes. + +ncolumns (input) int +The number of columns in the gene expression data matrix, equal to the number +of samples. + +data (input) double[nrows][ncolumns] +The array containing the gene expression data. + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If +mask[i][j] == 0, then data[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +transpose (input) int +If transpose == 0, the rows of the matrix are clustered. Otherwise, columns +of the matrix are clustered. + +dist (input) char +Defines which distance measure is used, as given by the table: +dist == 'e': Euclidean distance +dist == 'b': City-block distance +dist == 'c': correlation +dist == 'a': absolute value of the correlation +dist == 'u': uncentered correlation +dist == 'x': absolute uncentered correlation +dist == 's': Spearman's rank correlation +dist == 'k': Kendall's tau +For other values of dist, the default (Euclidean distance) is used. + +distmatrix (input) double** +The distance matrix. If the distance matrix is passed by the calling routine +treecluster, it is used by pslcluster to speed up the clustering calculation. +The pslcluster routine does not modify the contents of distmatrix, and does +not deallocate it. If distmatrix is NULL, the pairwise distances are calculated +by the pslcluster routine from the gene expression data (the data and mask +arrays) and stored in temporary arrays. If distmatrix is passed, the original +gene expression data (specified by the data and mask arguments) are not needed +and are therefore ignored. + + +Return value +============ + +A pointer to a newly allocated array of Node structs, describing the +hierarchical clustering solution consisting of nelements-1 nodes. Depending +on whether genes (rows) or samples (columns) were clustered, nelements is +equal to nrows or ncolumns. See src/cluster.h for a description of the Node +structure. +If a memory error occurs, pslcluster returns NULL. + +======================================================================== +*/ +{ + int i, j, k; + const int nelements = transpose ? 
ncolumns : nrows; + const int nnodes = nelements - 1; + int* vector; + double* temp; + int* index; + Node* result; + + temp = malloc(nnodes*sizeof(double)); + if (!temp) return NULL; + index = malloc(nelements*sizeof(int)); + if (!index) { + free(temp); + return NULL; + } + vector = malloc(nnodes*sizeof(int)); + if (!vector) { + free(index); + free(temp); + return NULL; + } + result = malloc(nelements*sizeof(Node)); + if (!result) { + free(vector); + free(index); + free(temp); + return NULL; + } + + for (i = 0; i < nnodes; i++) vector[i] = i; + + if (distmatrix) { + for (i = 0; i < nrows; i++) { + result[i].distance = DBL_MAX; + for (j = 0; j < i; j++) temp[j] = distmatrix[i][j]; + for (j = 0; j < i; j++) { + k = vector[j]; + if (result[j].distance >= temp[j]) { + if (result[j].distance < temp[k]) + temp[k] = result[j].distance; + result[j].distance = temp[j]; + vector[j] = i; + } + else if (temp[j] < temp[k]) temp[k] = temp[j]; + } + for (j = 0; j < i; j++) { + if (result[j].distance >= result[vector[j]].distance) + vector[j] = i; + } + } + } + else { + const int ndata = transpose ? nrows : ncolumns; + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + for (i = 0; i < nelements; i++) { + result[i].distance = DBL_MAX; + for (j = 0; j < i; j++) temp[j] = + metric(ndata, data, data, mask, mask, weight, i, j, transpose); + for (j = 0; j < i; j++) { + k = vector[j]; + if (result[j].distance >= temp[j]) { + if (result[j].distance < temp[k]) + temp[k] = result[j].distance; + result[j].distance = temp[j]; + vector[j] = i; + } + else if (temp[j] < temp[k]) temp[k] = temp[j]; + } + for (j = 0; j < i; j++) + if (result[j].distance >= result[vector[j]].distance) + vector[j] = i; + } + } + free(temp); + + for (i = 0; i < nnodes; i++) result[i].left = i; + qsort(result, nnodes, sizeof(Node), nodecompare); + + for (i = 0; i < nelements; i++) index[i] = i; + for (i = 0; i < nnodes; i++) { + j = result[i].left; + k = vector[j]; + result[i].left = index[j]; + result[i].right = index[k]; + index[k] = -i-1; + } + free(vector); + free(index); + + result = realloc(result, nnodes*sizeof(Node)); + + return result; +} +/* ******************************************************************** */ + +static Node* +pmlcluster(int nelements, double** distmatrix) +/* + +Purpose +======= + +The pmlcluster routine performs clustering using pairwise maximum- (complete-) +linking on the given distance matrix. + +Arguments +========= + +nelements (input) int +The number of elements to be clustered. + +distmatrix (input) double** +The distance matrix, with nelements rows, each row being filled up to the +diagonal. The elements on the diagonal are not used, as they are assumed to be +zero. The distance matrix will be modified by this routine. + +Return value +============ + +A pointer to a newly allocated array of Node structs, describing the +hierarchical clustering solution consisting of nelements-1 nodes. Depending on +whether genes (rows) or samples (columns) were clustered, nelements is equal +to nrows or ncolumns. See src/cluster.h for a description of the Node +structure. +If a memory error occurs, pmlcluster returns NULL. 
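+
+Worked example (editorial addition): with maximum (complete) linkage, after
+clusters A and B are merged, the distance from the merged cluster to any other
+cluster C is the larger of the two old distances; this is what the max()
+updates in the body below compute. For instance, if d(A,C) = 2.0 and
+d(B,C) = 5.0, then d(A u B, C) = max(2.0, 5.0) = 5.0.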
+======================================================================== +*/ +{ + int j; + int n; + int* clusterid; + Node* result; + + clusterid = malloc(nelements*sizeof(int)); + if (!clusterid) return NULL; + result = malloc((nelements-1)*sizeof(Node)); + if (!result) { + free(clusterid); + return NULL; + } + + /* Setup a list specifying to which cluster a gene belongs */ + for (j = 0; j < nelements; j++) clusterid[j] = j; + + for (n = nelements; n > 1; n--) { + int is = 1; + int js = 0; + + result[nelements-n].distance = find_closest_pair(n, distmatrix, + &is, &js); + + /* Fix the distances */ + for (j = 0; j < js; j++) + distmatrix[js][j] = max(distmatrix[is][j], distmatrix[js][j]); + for (j = js+1; j < is; j++) + distmatrix[j][js] = max(distmatrix[is][j], distmatrix[j][js]); + for (j = is+1; j < n; j++) + distmatrix[j][js] = max(distmatrix[j][is], distmatrix[j][js]); + + for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j]; + for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j]; + + /* Update clusterids */ + result[nelements-n].left = clusterid[is]; + result[nelements-n].right = clusterid[js]; + clusterid[js] = n-nelements-1; + clusterid[is] = clusterid[n-1]; + } + free(clusterid); + + return result; +} + +/* ******************************************************************* */ + +static Node* +palcluster(int nelements, double** distmatrix) +/* +Purpose +======= + +The palcluster routine performs clustering using pairwise average +linking on the given distance matrix. + +Arguments +========= + +nelements (input) int +The number of elements to be clustered. + +distmatrix (input) double** +The distance matrix, with nelements rows, each row being filled up to the +diagonal. The elements on the diagonal are not used, as they are assumed to be +zero. The distance matrix will be modified by this routine. + +Return value +============ + +A pointer to a newly allocated array of Node structs, describing the +hierarchical clustering solution consisting of nelements-1 nodes. Depending on +whether genes (rows) or samples (columns) were clustered, nelements is equal +to nrows or ncolumns. See src/cluster.h for a description of the Node +structure. +If a memory error occurs, palcluster returns NULL. +======================================================================== +*/ +{ + int j; + int n; + int* clusterid; + int* number; + Node* result; + + clusterid = malloc(nelements*sizeof(int)); + if (!clusterid) return NULL; + number = malloc(nelements*sizeof(int)); + if (!number) { + free(clusterid); + return NULL; + } + result = malloc((nelements-1)*sizeof(Node)); + if (!result) { + free(clusterid); + free(number); + return NULL; + } + + /* Setup a list specifying to which cluster a gene belongs, and keep track + * of the number of elements in each cluster (needed to calculate the + * average). 
*/ + for (j = 0; j < nelements; j++) { + number[j] = 1; + clusterid[j] = j; + } + + for (n = nelements; n > 1; n--) { + int sum; + int is = 1; + int js = 0; + result[nelements-n].distance = find_closest_pair(n, distmatrix, + &is, &js); + + /* Save result */ + result[nelements-n].left = clusterid[is]; + result[nelements-n].right = clusterid[js]; + + /* Fix the distances */ + sum = number[is] + number[js]; + for (j = 0; j < js; j++) { + distmatrix[js][j] = distmatrix[is][j]*number[is] + + distmatrix[js][j]*number[js]; + distmatrix[js][j] /= sum; + } + for (j = js+1; j < is; j++) { + distmatrix[j][js] = distmatrix[is][j]*number[is] + + distmatrix[j][js]*number[js]; + distmatrix[j][js] /= sum; + } + for (j = is+1; j < n; j++) { + distmatrix[j][js] = distmatrix[j][is]*number[is] + + distmatrix[j][js]*number[js]; + distmatrix[j][js] /= sum; + } + + for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j]; + for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j]; + + /* Update number of elements in the clusters */ + number[js] = sum; + number[is] = number[n-1]; + + /* Update clusterids */ + clusterid[js] = n-nelements-1; + clusterid[is] = clusterid[n-1]; + } + free(clusterid); + free(number); + + return result; +} + +/* ******************************************************************* */ + +Node* +treecluster(int nrows, int ncolumns, double** data, int** mask, + double weight[], int transpose, char dist, char method, + double** distmatrix) +/* +Purpose +======= + +The treecluster routine performs hierarchical clustering using pairwise +single-, maximum-, centroid-, or average-linkage, as defined by method, on a +given set of gene expression data, using the distance metric given by dist. +If successful, the function returns a pointer to a newly allocated Tree struct +containing the hierarchical clustering solution, and NULL if a memory error +occurs. The pointer should be freed by the calling routine to prevent memory +leaks. + +Arguments +========= + +nrows (input) int +The number of rows in the data matrix, equal to the number of genes. + +ncolumns (input) int +The number of columns in the data matrix, equal to the number of samples. + +data (input) double[nrows][ncolumns] +The array containing the data of the vectors to be clustered. + +mask (input) int[nrows][ncolumns] +This array shows which data values are missing. If mask[i][j] == 0, then +data[i][j] is missing. + +weight (input) double[ncolumns] if transpose == 0, + double[nrows] otherwise +The weights that are used to calculate the distance. This is equivalent +to including the jth data point weight[j] times in the calculation. The +weights can be non-integer. + +transpose (input) int +If transpose == 0, the rows of the matrix are clustered. Otherwise, columns +of the matrix are clustered. + +dist (input) char +Defines which distance measure is used, as given by the table: +dist == 'e': Euclidean distance +dist == 'b': City-block distance +dist == 'c': correlation +dist == 'a': absolute value of the correlation +dist == 'u': uncentered correlation +dist == 'x': absolute uncentered correlation +dist == 's': Spearman's rank correlation +dist == 'k': Kendall's tau +For other values of dist, the default (Euclidean distance) is used. 
+ +method (input) char +Defines which hierarchical clustering method is used: +method == 's': pairwise single-linkage clustering +method == 'm': pairwise maximum- (or complete-) linkage clustering +method == 'a': pairwise average-linkage clustering +method == 'c': pairwise centroid-linkage clustering +For the first three, either the distance matrix or the gene expression data is +sufficient to perform the clustering algorithm. For pairwise centroid-linkage +clustering, however, the gene expression data are always needed, even if the +distance matrix itself is available. + +distmatrix (input) double** +The distance matrix. If the distance matrix is zero initially, the distance +matrix will be allocated and calculated from the data by treecluster, and +deallocated before treecluster returns. If the distance matrix is passed by the +calling routine, treecluster will modify the contents of the distance matrix as +part of the clustering algorithm, but will not deallocate it. The calling +routine should deallocate the distance matrix after the return from +treecluster. + +Return value +============ + +A pointer to a newly allocated array of Node structs, describing the +hierarchical clustering solution consisting of nelements-1 nodes. Depending on +whether genes (rows) or samples (columns) were clustered, nelements is equal +to nrows or ncolumns. See src/cluster.h for a description of the Node +structure. +If a memory error occurs, treecluster returns NULL. + +======================================================================== +*/ +{ + Node* result = NULL; + const int nelements = (transpose == 0) ? nrows : ncolumns; + const int ldistmatrix = (distmatrix == NULL && method != 's') ? 1 : 0; + + if (nelements < 2) return NULL; + + /* Calculate the distance matrix if the user didn't give it */ + if (ldistmatrix) { + /* Set up the ragged array */ + int i; + distmatrix = malloc(nelements*sizeof(double*)); + if (distmatrix == NULL) return NULL; /* Not enough memory available */ + distmatrix[0] = NULL; + for (i = 1; i < nelements; i++) { + distmatrix[i] = malloc(i*sizeof(double)); + if (distmatrix[i] == NULL) /* Not enough memory available */ { + while (--i > 0) free(distmatrix[i]); + free(distmatrix); + return NULL; + } + } + distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose, + distmatrix); + } + + switch(method) { + case 's': + result = pslcluster(nrows, ncolumns, data, mask, weight, + distmatrix, dist, transpose); + break; + case 'm': + result = pmlcluster(nelements, distmatrix); + break; + case 'a': + result = palcluster(nelements, distmatrix); + break; + case 'c': + result = pclcluster(nrows, ncolumns, data, mask, weight, + distmatrix, dist, transpose); + break; + } + + /* Deallocate space for distance matrix if allocated by treecluster */ + if (ldistmatrix) { + int i; + for (i = 1; i < nelements; i++) free(distmatrix[i]); + free(distmatrix); + } + + return result; +} + +/* ******************************************************************* */ + +int +sorttree(const int nnodes, Node* tree, const double order[], int indices[]) +/* +Purpose +======= + +The sorttree routine sorts the items in a hierarchical clustering solution +based on their order values, while remaining consistent with the hierchical +clustering solution. + +Arguments +========= + +nnodes (input) int +The number of nodes in the hierarchical clustering tree. + +tree (input) Node[nnodes] +The hierarchical clustering tree describing the clustering solution. 
+ +order (input) double[nnodes+1] +The preferred order of the items. + +indices (output) int* +The indices of each item after sorting, with item i appearing at indices[i] +after sorting. + +Return value +============ + +If no errors occur, sorttree returns 1. +If a memory error occurs, sorttree returns 0. + +======================================================================== +*/ + +{ + int i; + int index; + int i1, i2; + double order1, order2; + int counts1, counts2; + int* nodecounts; + + nodecounts = malloc(nnodes*sizeof(int)); + if (!nodecounts) return 0; + if (order) { + double* nodeorder = malloc(nnodes*sizeof(double)); + if (!nodeorder) { + free(nodecounts); + return 0; + } + for (i = 0; i < nnodes; i++) { + i1 = tree[i].left; + i2 = tree[i].right; + /* i1 and i2 are the elements that are to be joined */ + if (i1 < 0) { + index = -i1-1; + order1 = nodeorder[index]; + counts1 = nodecounts[index]; + } + else { + order1 = order[i1]; + counts1 = 1; + } + if (i2 < 0) { + index = -i2-1; + order2 = nodeorder[index]; + counts2 = nodecounts[index]; + } + else { + order2 = order[i2]; + counts2 = 1; + } + if (order1 > order2) { + tree[i].left = i2; + tree[i].right = i1; + } + nodecounts[i] = counts1 + counts2; + nodeorder[i] = (counts1*order1+counts2*order2) / (counts1+counts2); + } + free(nodeorder); + } + else { + for (i = 0; i < nnodes; i++) { + i1 = tree[i].left; + i2 = tree[i].right; + /* i1 and i2 are the elements that are to be joined */ + counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1; + counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1; + nodecounts[i] = counts1 + counts2; + } + } + i--; + nodecounts[i] = 0; + for ( ; i >= 0; i--) { + i1 = tree[i].left; + i2 = tree[i].right; + counts1 = (i1<0) ? nodecounts[-i1-1] : 1; + index = nodecounts[i]; + if (i1 >= 0) indices[index] = i1; + else nodecounts[-i1-1] = index; + index += counts1; + if (i2 >= 0) indices[index] = i2; + else nodecounts[-i2-1] = index; + } + free(nodecounts); + return 1; +} + +/* ******************************************************************* */ + +static void +somworker(int nrows, int ncolumns, double** data, int** mask, + const double weights[], int transpose, int nxgrid, int nygrid, + double inittau, double*** celldata, int niter, char dist) + +{ + const int nelements = (transpose == 0) ? nrows : ncolumns; + const int ndata = (transpose == 0) ? 
ncolumns : nrows; + int i, j; + int** dummymask; + int ix, iy; + int* index; + int iter; + /* Maximum radius in which nodes are adjusted */ + double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid); + double* stddata = calloc(nelements, sizeof(double)); + + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + /* Calculate the standard deviation for each row or column */ + if (transpose == 0) { + for (i = 0; i < nelements; i++) { + int n = 0; + for (j = 0; j < ndata; j++) { + if (mask[i][j]) { + double term = data[i][j]; + term = term * term; + stddata[i] += term; + n++; + } + } + if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n); + else stddata[i] = 1; + } + } + else { + for (i = 0; i < nelements; i++) { + int n = 0; + for (j = 0; j < ndata; j++) { + if (mask[j][i]) { + double term = data[j][i]; + term = term * term; + stddata[i] += term; + n++; + } + } + if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n); + else stddata[i] = 1; + } + } + + if (transpose == 0) { + dummymask = malloc(nygrid*sizeof(int*)); + for (i = 0; i < nygrid; i++) { + dummymask[i] = malloc(ndata*sizeof(int)); + for (j = 0; j < ndata; j++) dummymask[i][j] = 1; + } + } + else { + dummymask = malloc(ndata*sizeof(int*)); + for (i = 0; i < ndata; i++) { + dummymask[i] = malloc(sizeof(int)); + dummymask[i][0] = 1; + } + } + + /* Randomly initialize the nodes */ + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + double sum = 0.; + for (i = 0; i < ndata; i++) { + double term = -1.0 + 2.0*uniform(); + celldata[ix][iy][i] = term; + sum += term * term; + } + sum = sqrt(sum/ndata); + for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum; + } + } + + /* Randomize the order in which genes or arrays will be used */ + index = malloc(nelements*sizeof(int)); + for (i = 0; i < nelements; i++) index[i] = i; + for (i = 0; i < nelements; i++) { + j = (int) (i + (nelements-i)*uniform()); + ix = index[j]; + index[j] = index[i]; + index[i] = ix; + } + + /* Start the iteration */ + for (iter = 0; iter < niter; iter++) { + int ixbest = 0; + int iybest = 0; + int iobject = iter % nelements; + iobject = index[iobject]; + if (transpose == 0) { + double closest = metric(ndata, data, celldata[ixbest], mask, + dummymask, weights, iobject, iybest, + transpose); + double radius = maxradius * (1. - ((double)iter)/((double)niter)); + double tau = inittau * (1. - ((double)iter)/((double)niter)); + + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + double distance = metric(ndata, data, celldata[ix], mask, + dummymask, weights, iobject, iy, + transpose); + if (distance < closest) { + ixbest = ix; + iybest = iy; + closest = distance; + } + } + } + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) < + radius) { + double sum = 0.; + for (i = 0; i < ndata; i++) { + if (mask[iobject][i] == 0) continue; + celldata[ix][iy][i] += + tau * (data[iobject][i]/stddata[iobject] + -celldata[ix][iy][i]); + } + for (i = 0; i < ndata; i++) { + double term = celldata[ix][iy][i]; + term = term * term; + sum += term; + } + if (sum>0) { + sum = sqrt(sum/ndata); + for (i = 0; i < ndata; i++) + celldata[ix][iy][i] /= sum; + } + } + } + } + } + else { + double closest; + double** celldatavector = malloc(ndata*sizeof(double*)); + double radius = maxradius * (1. - ((double)iter)/((double)niter)); + double tau = inittau * (1. 
- ((double)iter)/((double)niter)); + + for (i = 0; i < ndata; i++) + celldatavector[i] = &(celldata[ixbest][iybest][i]); + closest = metric(ndata, data, celldatavector, mask, dummymask, + weights, iobject, 0, transpose); + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + double distance; + for (i = 0; i < ndata; i++) + celldatavector[i] = &(celldata[ixbest][iybest][i]); + distance = metric(ndata, data, celldatavector, mask, + dummymask, weights, iobject, 0, + transpose); + if (distance < closest) { + ixbest = ix; + iybest = iy; + closest = distance; + } + } + } + free(celldatavector); + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) < + radius) { + double sum = 0.; + for (i = 0; i < ndata; i++) { + if (mask[i][iobject] == 0) continue; + celldata[ix][iy][i] += + tau * (data[i][iobject]/stddata[iobject] + -celldata[ix][iy][i]); + } + for (i = 0; i < ndata; i++) { + double term = celldata[ix][iy][i]; + term = term * term; + sum += term; + } + if (sum>0) { + sum = sqrt(sum/ndata); + for (i = 0; i < ndata; i++) + celldata[ix][iy][i] /= sum; + } + } + } + } + } + } + if (transpose == 0) + for (i = 0; i < nygrid; i++) free(dummymask[i]); + else + for (i = 0; i < ndata; i++) free(dummymask[i]); + free(dummymask); + free(stddata); + free(index); +} + +/* ******************************************************************* */ + +static void +somassign(int nrows, int ncolumns, double** data, int** mask, + const double weights[], int transpose, int nxgrid, int nygrid, + double*** celldata, char dist, int clusterid[][2]) +/* Collect clusterids */ +{ + const int ndata = (transpose == 0) ? ncolumns : nrows; + int i, j; + + /* Set the metric function as indicated by dist */ + double (*metric) (int, double**, double**, int**, int**, + const double[], int, int, int) = setmetric(dist); + + if (transpose == 0) { + int** dummymask = malloc(nygrid*sizeof(int*)); + for (i = 0; i < nygrid; i++) { + dummymask[i] = malloc(ncolumns*sizeof(int)); + for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1; + } + for (i = 0; i < nrows; i++) { + int ixbest = 0; + int iybest = 0; + double closest = metric(ndata, data, celldata[ixbest], mask, + dummymask, weights, i, iybest, transpose); + int ix, iy; + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + double distance = metric(ndata, data, celldata[ix], mask, + dummymask, weights, i, iy, + transpose); + if (distance < closest) { + ixbest = ix; + iybest = iy; + closest = distance; + } + } + } + clusterid[i][0] = ixbest; + clusterid[i][1] = iybest; + } + for (i = 0; i < nygrid; i++) free(dummymask[i]); + free(dummymask); + } + else { + double** celldatavector = malloc(ndata*sizeof(double*)); + int** dummymask = malloc(nrows*sizeof(int*)); + int ixbest = 0; + int iybest = 0; + for (i = 0; i < nrows; i++) { + dummymask[i] = malloc(sizeof(int)); + dummymask[i][0] = 1; + } + for (i = 0; i < ncolumns; i++) { + double closest; + int ix, iy; + for (j = 0; j < ndata; j++) + celldatavector[j] = &(celldata[ixbest][iybest][j]); + closest = metric(ndata, data, celldatavector, mask, dummymask, + weights, i, 0, transpose); + for (ix = 0; ix < nxgrid; ix++) { + for (iy = 0; iy < nygrid; iy++) { + double distance; + for (j = 0; j < ndata; j++) + celldatavector[j] = &(celldata[ix][iy][j]); + distance = metric(ndata, data, celldatavector, mask, + dummymask, weights, i, 0, transpose); + if (distance < closest) { + ixbest = ix; + iybest = iy; + closest = distance; + } + } + } 
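+            /* The exhaustive grid search above is done; (ixbest, iybest) is
+             * now the SOM node closest to item i, recorded below. */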
+            clusterid[i][0] = ixbest;
+            clusterid[i][1] = iybest;
+        }
+        free(celldatavector);
+        for (i = 0; i < nrows; i++) free(dummymask[i]);
+        free(dummymask);
+    }
+}
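+
+/* Usage sketch (editorial example, not part of the original library): calling
+ * somcluster below with a 2x2 SOM grid on a small matrix. The data, grid
+ * size, inittau, and iteration count are made up for illustration.
+ */
+#if 0
+static void example_somcluster(void)
+{
+    double r0[] = {1.0, 2.0, 3.0}, r1[] = {3.0, 2.0, 1.0},
+           r2[] = {1.1, 2.1, 2.9};
+    double* data[3] = {r0, r1, r2};
+    int m0[] = {1, 1, 1}, m1[] = {1, 1, 1}, m2[] = {1, 1, 1};
+    int* mask[3] = {m0, m1, m2};
+    double weight[3] = {1.0, 1.0, 1.0};
+    int clusterid[3][2];
+
+    /* celldata == NULL: somcluster allocates and frees the centroids itself */
+    somcluster(3, 3, data, mask, weight, 0, 2, 2, 0.02, 100, 'e',
+               NULL, clusterid);
+}
+#endif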
+
+/* ******************************************************************* */
+
+void
+somcluster(int nrows, int ncolumns, double** data, int** mask,
+    const double weight[], int transpose, int nxgrid, int nygrid,
+    double inittau, int niter, char dist, double*** celldata,
+    int clusterid[][2])
+/*
+
+Purpose
+=======
+
+The somcluster routine implements a self-organizing map (Kohonen) on a
+rectangular grid, using a given set of vectors. The distance measure to be
+used to find the similarity between genes and nodes is given by dist.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the data matrix, equal to the number of genes.
+
+ncolumns (input) int
+The number of columns in the data matrix, equal to the number of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows (genes) of the matrix are clustered. Otherwise,
+columns (samples) of the matrix are clustered.
+
+nxgrid (input) int
+The number of grid cells horizontally in the rectangular topology of clusters.
+
+nygrid (input) int
+The number of grid cells vertically in the rectangular topology of clusters.
+
+inittau (input) double
+The initial value of tau, representing the neighborhood function.
+
+niter (input) int
+The number of iterations to be performed.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+celldata (output) double[nxgrid][nygrid][ncolumns] if transpose == 0;
+ double[nxgrid][nygrid][nrows] otherwise
+The gene expression data for each node (cell) in the 2D grid. This can be
+interpreted as the centroid for the cluster corresponding to that cell. If
+celldata is NULL, then the centroids are not returned. If celldata is not
+NULL, enough space should be allocated to store the centroid data before
+calling somcluster.
+
+clusterid (output) int[nrows][2] if transpose == 0;
+ int[ncolumns][2] otherwise
+For each item (gene or microarray) that is clustered, the coordinates of the
+cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
+cluster assignments are not returned. If clusterid is not NULL, enough memory
+should be allocated to store the clustering information before calling
+somcluster.
+
+========================================================================
+*/
+{
+    const int nobjects = (transpose == 0) ? nrows : ncolumns;
+    const int ndata = (transpose == 0) ? ncolumns : nrows;
+    int i, j;
+    const int lcelldata = (celldata == NULL) ? 0 : 1;
+
+    if (nobjects < 2) return;
+
+    if (lcelldata == 0) {
+        celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
+        for (i = 0; i < nxgrid; i++) {
+            celldata[i] = malloc(nygrid*ndata*sizeof(double*));
+            for (j = 0; j < nygrid; j++)
+                celldata[i][j] = malloc(ndata*sizeof(double));
+        }
+    }
+
+    somworker(nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
+              inittau, celldata, niter, dist);
+    if (clusterid)
+        somassign(nrows, ncolumns, data, mask, weight, transpose,
+                  nxgrid, nygrid, celldata, dist, clusterid);
+    if (lcelldata == 0) {
+        for (i = 0; i < nxgrid; i++)
+            for (j = 0; j < nygrid; j++)
+                free(celldata[i][j]);
+        for (i = 0; i < nxgrid; i++)
+            free(celldata[i]);
+        free(celldata);
+    }
+}
+
+/* ******************************************************************** */
+
+double
+clusterdistance(int nrows, int ncolumns, double** data, int** mask,
+    double weight[], int n1, int n2, int index1[], int index2[],
+    char dist, char method, int transpose)
+
+/*
+Purpose
+=======
+
+The clusterdistance routine calculates the distance between two clusters
+containing genes or samples using the measured gene expression vectors. The
+distance between clusters, given the genes/samples in each cluster, can be
+defined in several ways. Several distance measures can be used.
+
+The routine returns the distance in double precision.
+If the parameter transpose is set to a nonzero value, the clusters are
+interpreted as clusters of samples, otherwise as clusters of genes.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows (i.e., the number of genes) in the gene expression data
+matrix.
+
+ncolumns (input) int
+The number of columns (i.e., the number of samples) in the gene expression
+data matrix.
+
+data (input) double[nrows][ncolumns]
+The array containing the data of the vectors.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+n1 (input) int
+The number of elements in the first cluster.
+
+n2 (input) int
+The number of elements in the second cluster.
+
+index1 (input) int[n1]
+Identifies which genes/samples belong to the first cluster.
+
+index2 (input) int[n2]
+Identifies which genes/samples belong to the second cluster.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+method (input) char
+Defines how the distance between two clusters is defined, given which genes
+belong to which cluster:
+method == 'a': the distance between the arithmetic means of the two clusters
+method == 'm': the distance between the medians of the two clusters
+method == 's': the smallest pairwise distance between members of the two
+               clusters
+method == 'x': the largest pairwise distance between members of the two
+               clusters
+method == 'v': average of the pairwise distances between members of the two
+               clusters
+
+transpose (input) int
+If transpose is equal to zero, the distances between the rows are
+calculated. Otherwise, the distances between the columns are calculated.
+The former is needed when genes are being clustered; the latter is used
+when samples are being clustered.
+
+========================================================================
+*/
+{
+    /* Set the metric function as indicated by dist */
+    double (*metric) (int, double**, double**, int**, int**,
+        const double[], int, int, int) = setmetric(dist);
+
+    /* if one or both clusters are empty, return */
+    if (n1 < 1 || n2 < 1) return -1.0;
+    /* Check the indices */
+    if (transpose == 0) {
+        int i;
+        for (i = 0; i < n1; i++) {
+            int index = index1[i];
+            if (index < 0 || index >= nrows) return -1.0;
+        }
+        for (i = 0; i < n2; i++) {
+            int index = index2[i];
+            if (index < 0 || index >= nrows) return -1.0;
+        }
+    }
+    else {
+        int i;
+        for (i = 0; i < n1; i++) {
+            int index = index1[i];
+            if (index < 0 || index >= ncolumns) return -1.0;
+        }
+        for (i = 0; i < n2; i++) {
+            int index = index2[i];
+            if (index < 0 || index >= ncolumns) return -1.0;
+        }
+    }
+
+    switch (method) {
+        case 'a': {
+            /* Find the center */
+            int i, j, k;
+            if (transpose == 0) {
+                double distance;
+                double* cdata[2];
+                int* cmask[2];
+                int* count[2];
+                count[0] = calloc(ncolumns, sizeof(int));
+                count[1] = calloc(ncolumns, sizeof(int));
+                cdata[0] = calloc(ncolumns, sizeof(double));
+                cdata[1] = calloc(ncolumns, sizeof(double));
+                cmask[0] = malloc(ncolumns*sizeof(int));
+                cmask[1] = malloc(ncolumns*sizeof(int));
+                for (i = 0; i < n1; i++) {
+                    k = index1[i];
+                    for (j = 0; j < ncolumns; j++)
+                        if (mask[k][j] != 0) {
+                            cdata[0][j] = cdata[0][j] + data[k][j];
+                            count[0][j] = count[0][j] + 1;
+                        }
+                }
+                for (i = 0; i < n2; i++) {
+                    k = index2[i];
+                    for (j = 0; j < ncolumns; j++)
+                        if (mask[k][j] != 0) {
+                            cdata[1][j] = cdata[1][j] + data[k][j];
+                            count[1][j] = count[1][j] + 1;
+                        }
+                }
+                for (i = 0; i < 2; i++)
+                    for (j = 0; j < ncolumns; j++) {
+                        if (count[i][j]>0) {
+                            cdata[i][j] = cdata[i][j] / count[i][j];
+                            cmask[i][j] = 1;
+                        }
+                        else
+                            cmask[i][j] = 0;
+                    }
+                distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
+                    0, 1, 0);
+                for (i = 0; i < 2; i++) {
+                    free(cdata[i]);
+                    free(cmask[i]);
+                    free(count[i]);
+                }
+                return distance;
+            }
+            else {
+                double distance;
+                int** count = malloc(nrows*sizeof(int*));
+                double** cdata = malloc(nrows*sizeof(double*));
+                int** cmask = malloc(nrows*sizeof(int*));
+                for (i = 0; i < nrows; i++) {
+                    count[i] = calloc(2, sizeof(int));
+                    cdata[i] = calloc(2, sizeof(double));
+                    cmask[i] = malloc(2*sizeof(int));
+                }
+                for (i = 0; i < n1; i++) {
+                    k = index1[i];
+                    for (j = 0; j < nrows; j++) {
+                        if (mask[j][k] != 0) {
+                            cdata[j][0] += data[j][k];
+                            count[j][0]++;
+                        }
+                    }
+                }
+                for (i = 0; i < n2; i++) {
+                    k = index2[i];
+                    for (j = 0; j < nrows; j++) {
+                        if (mask[j][k] != 0) {
+                            cdata[j][1] += data[j][k];
+                            count[j][1]++;
+                        }
+                    }
+                }
+                for (i = 0; i < nrows; i++)
+                    for (j = 0; j < 2; j++)
+                        if (count[i][j]>0) {
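+                            /* nonzero count: convert the accumulated sum into the centroid mean */
+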
cdata[i][j] /= count[i][j]; + cmask[i][j] = 1; + } + else + cmask[i][j] = 0; + distance = metric(nrows, cdata, cdata, cmask, cmask, weight, + 0, 1, 1); + for (i = 0; i < nrows; i++) { + free(count[i]); + free(cdata[i]); + free(cmask[i]); + } + free(count); + free(cdata); + free(cmask); + return distance; + } + } + case 'm': { + int i, j, k; + if (transpose == 0) { + double distance; + double* temp = malloc(nrows*sizeof(double)); + double* cdata[2]; + int* cmask[2]; + for (i = 0; i < 2; i++) { + cdata[i] = malloc(ncolumns*sizeof(double)); + cmask[i] = malloc(ncolumns*sizeof(int)); + } + for (j = 0; j < ncolumns; j++) { + int count = 0; + for (k = 0; k < n1; k++) { + i = index1[k]; + if (mask[i][j]) { + temp[count] = data[i][j]; + count++; + } + } + if (count>0) { + cdata[0][j] = median(count, temp); + cmask[0][j] = 1; + } + else { + cdata[0][j] = 0.; + cmask[0][j] = 0; + } + } + for (j = 0; j < ncolumns; j++) { + int count = 0; + for (k = 0; k < n2; k++) { + i = index2[k]; + if (mask[i][j]) { + temp[count] = data[i][j]; + count++; + } + } + if (count>0) { + cdata[1][j] = median(count, temp); + cmask[1][j] = 1; + } + else { + cdata[1][j] = 0.; + cmask[1][j] = 0; + } + } + distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight, + 0, 1, 0); + for (i = 0; i < 2; i++) { + free(cdata[i]); + free(cmask[i]); + } + free(temp); + return distance; + } + else { + double distance; + double* temp = malloc(ncolumns*sizeof(double)); + double** cdata = malloc(nrows*sizeof(double*)); + int** cmask = malloc(nrows*sizeof(int*)); + for (i = 0; i < nrows; i++) { + cdata[i] = malloc(2*sizeof(double)); + cmask[i] = malloc(2*sizeof(int)); + } + for (j = 0; j < nrows; j++) { + int count = 0; + for (k = 0; k < n1; k++) { + i = index1[k]; + if (mask[j][i]) { + temp[count] = data[j][i]; + count++; + } + } + if (count>0) { + cdata[j][0] = median(count, temp); + cmask[j][0] = 1; + } + else { + cdata[j][0] = 0.; + cmask[j][0] = 0; + } + } + for (j = 0; j < nrows; j++) { + int count = 0; + for (k = 0; k < n2; k++) { + i = index2[k]; + if (mask[j][i]) { + temp[count] = data[j][i]; + count++; + } + } + if (count>0) { + cdata[j][1] = median(count, temp); + cmask[j][1] = 1; + } + else { + cdata[j][1] = 0.; + cmask[j][1] = 0; + } + } + distance = metric(nrows, cdata, cdata, cmask, cmask, weight, + 0, 1, 1); + for (i = 0; i < nrows; i++) { + free(cdata[i]); + free(cmask[i]); + } + free(cdata); + free(cmask); + free(temp); + return distance; + } + } + case 's': { + int i1, i2, j1, j2; + const int n = (transpose == 0) ? ncolumns : nrows; + double mindistance = DBL_MAX; + for (i1 = 0; i1 < n1; i1++) + for (i2 = 0; i2 < n2; i2++) { + double distance; + j1 = index1[i1]; + j2 = index2[i2]; + distance = metric(n, data, data, mask, mask, weight, + j1, j2, transpose); + if (distance < mindistance) mindistance = distance; + } + return mindistance; + } + case 'x': { + int i1, i2, j1, j2; + const int n = (transpose == 0) ? ncolumns : nrows; + double maxdistance = 0; + for (i1 = 0; i1 < n1; i1++) + for (i2 = 0; i2 < n2; i2++) { + double distance; + j1 = index1[i1]; + j2 = index2[i2]; + distance = metric(n, data, data, mask, mask, weight, + j1, j2, transpose); + if (distance > maxdistance) maxdistance = distance; + } + return maxdistance; + } + case 'v': { + int i1, i2, j1, j2; + const int n = (transpose == 0) ? 
ncolumns : nrows; + double distance = 0; + for (i1 = 0; i1 < n1; i1++) + for (i2 = 0; i2 < n2; i2++) { + j1 = index1[i1]; + j2 = index2[i2]; + distance += metric(n, data, data, mask, mask, weight, + j1, j2, transpose); + } + distance /= (n1*n2); + return distance; + } + } + /* Never get here */ + return -2.0; +} diff --git a/code/lib/Bio/Cluster/cluster.h b/code/lib/Bio/Cluster/cluster.h new file mode 100644 index 0000000..fbbfd26 --- /dev/null +++ b/code/lib/Bio/Cluster/cluster.h @@ -0,0 +1,90 @@ +/******************************************************************************/ +/* The C Clustering Library. + * Copyright (C) 2002 Michiel Jan Laurens de Hoon. + * + * This library was written at the Laboratory of DNA Information Analysis, + * Human Genome Center, Institute of Medical Science, University of Tokyo, + * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan. + * Contact: michiel.dehoon 'AT' riken.jp + * + * Permission to use, copy, modify, and distribute this software and its + * documentation with or without modifications and for any purpose and + * without fee is hereby granted, provided that any copyright notices + * appear in all copies and that both those copyright notices and this + * permission notice appear in supporting documentation, and that the + * names of the contributors or copyright holders not be used in + * advertising or publicity pertaining to distribution of the software + * without specific prior permission. + * + * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT + * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#ifndef min +#define min(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef max +#define max(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#define CLUSTERVERSION "1.59" + +/* Chapter 2 */ +double clusterdistance(int nrows, int ncolumns, double** data, int** mask, + double weight[], int n1, int n2, int index1[], int index2[], char dist, + char method, int transpose); +void distancematrix(int ngenes, int ndata, double** data, int** mask, + double* weight, char dist, int transpose, double** distances); + +/* Chapter 3 */ +int getclustercentroids(int nclusters, int nrows, int ncolumns, + double** data, int** mask, int clusterid[], double** cdata, int** cmask, + int transpose, char method); +void getclustermedoids(int nclusters, int nelements, double** distance, + int clusterid[], int centroids[], double errors[]); +void kcluster(int nclusters, int ngenes, int ndata, double** data, + int** mask, double weight[], int transpose, int npass, char method, char dist, + int clusterid[], double* error, int* ifound); +void kmedoids(int nclusters, int nelements, double** distance, + int npass, int clusterid[], double* error, int* ifound); + +/* Chapter 4 */ +typedef struct {int left; int right; double distance;} Node; +/* + * A Node struct describes a single node in a tree created by hierarchical + * clustering. The tree can be represented by an array of n Node structs, + * where n is the number of elements minus one. 
The integers left and right
+ * in each Node struct refer to the two elements or subnodes that are joined
+ * in this node. The original elements are numbered 0..nelements-1, and the
+ * nodes -1..-(nelements-1). For each node, distance contains the distance
+ * between the two subnodes that were joined.
+ */
+
+Node* treecluster(int nrows, int ncolumns, double** data, int** mask,
+    double weight[], int transpose, char dist, char method, double** distmatrix);
+int sorttree(const int nnodes, Node* tree, const double order[], int indices[]);
+int cuttree(int nelements, const Node* tree, int nclusters, int clusterid[]);
+
+/* Chapter 5 */
+void somcluster(int nrows, int ncolumns, double** data, int** mask,
+    const double weight[], int transpose, int nxnodes, int nynodes,
+    double inittau, int niter, char dist, double*** celldata,
+    int clusterid[][2]);
+
+/* Chapter 6 */
+int pca(int m, int n, double** u, double** v, double* w);
+
+/* Utility routines, currently undocumented */
+void sort(int n, const double data[], int index[]);
+double mean(int n, double x[]);
+double median(int n, double x[]);
+
+double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
+    double weights[], int transpose, char dist, double cutoff, double exponent);
diff --git a/code/lib/Bio/Cluster/clustermodule.c b/code/lib/Bio/Cluster/clustermodule.c
new file mode 100644
index 0000000..29b2a5c
--- /dev/null
+++ b/code/lib/Bio/Cluster/clustermodule.c
@@ -0,0 +1,2457 @@
+#include "Python.h"
+#include <math.h>   /* sqrt */
+#include <float.h>  /* DBL_MIN */
+#include <string.h> /* strchr, strcat */
+#include "cluster.h"
+
+
+/* ========================================================================= */
+/* -- Helper routines ------------------------------------------------------ */
+/* ========================================================================= */
+
+static char
+extract_single_character(PyObject* object, const char variable[],
+    const char allowed[])
+{
+    Py_UCS4 ch;
+    Py_ssize_t n;
+    if (!PyUnicode_Check(object)) {
+        PyErr_Format(PyExc_ValueError, "%s should be a string", variable);
+        return 0;
+    }
+    if (PyUnicode_READY(object) == -1) return 0;
+    n = PyUnicode_GET_LENGTH(object);
+    if (n != 1) {
+        PyErr_Format(PyExc_ValueError,
+            "%s should be a single character", variable);
+        return 0;
+    }
+    ch = PyUnicode_READ_CHAR(object, 0);
+    if (ch < 128) {
+        const char c = ch;
+        if (strchr(allowed, c)) return c;
+    }
+    PyErr_Format(PyExc_ValueError,
+        "unknown %s function specified (should be one of '%s')",
+        variable, allowed);
+    return 0;
+}
+
+static int
+distance_converter(PyObject* object, void* pointer)
+{
+    char c;
+
+    c = extract_single_character(object, "dist", "ebcauxsk");
+    if (c == 0) return 0;
+    *((char*)pointer) = c;
+    return 1;
+}
+
+static int
+method_treecluster_converter(PyObject* object, void* pointer)
+{
+    char c;
+
+    c = extract_single_character(object, "method", "csma");
+    if (c == 0) return 0;
+    *((char*)pointer) = c;
+    return 1;
+}
+
+static int
+method_kcluster_converter(PyObject* object, void* pointer)
+{
+    char c;
+
+    c = extract_single_character(object, "method", "am");
+    if (c == 0) return 0;
+    *((char*)pointer) = c;
+    return 1;
+}
+
+static int
+method_clusterdistance_converter(PyObject* object, void* pointer)
+{
+    char c;
+
+    c = extract_single_character(object, "method", "amsxv");
+    if (c == 0) return 0;
+    *((char*)pointer) = c;
+    return 1;
+}
+
+/* -- data ----------------------------------------------------------------- */
+
+typedef struct {
+    int nrows;
+    int ncols;
+    double** values;
+    Py_buffer view;
+} Data;
+
+static int
+data_converter(PyObject* object, void* pointer) +{ + Data* data = pointer; + int nrows; + int ncols; + int i; + double** values = data->values; + Py_buffer* view = &data->view; + const char* p; + Py_ssize_t stride; + const int flag = PyBUF_ND | PyBUF_STRIDES; + + if (object == NULL) goto exit; + if (object == Py_None) return 1; + + if (PyObject_GetBuffer(object, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, + "data matrix has unexpected format."); + return 0; + } + + if (view->ndim != 2) { + PyErr_Format(PyExc_RuntimeError, + "data matrix has incorrect rank %d (expected 2)", + view->ndim); + goto exit; + } + if (view->itemsize != sizeof(double)) { + PyErr_SetString(PyExc_RuntimeError, + "data matrix has incorrect data type"); + goto exit; + } + nrows = (int) view->shape[0]; + ncols = (int) view->shape[1]; + if (nrows != view->shape[0] || ncols != view->shape[1]) { + PyErr_Format(PyExc_ValueError, + "data matrix is too large (dimensions = %zd x %zd)", + view->shape[0], view->shape[1]); + goto exit; + } + if (nrows < 1 || ncols < 1) { + PyErr_SetString(PyExc_ValueError, "data matrix is empty"); + goto exit; + } + stride = view->strides[0]; + if (view->strides[1] != view->itemsize) { + PyErr_SetString(PyExc_RuntimeError, "data is not contiguous"); + goto exit; + } + values = PyMem_Malloc(nrows*sizeof(double*)); + if (!values) { + PyErr_NoMemory(); + goto exit; + } + for (i = 0, p = view->buf; i < nrows; i++, p += stride) + values[i] = (double*)p; + data->values = values; + data->nrows = nrows; + data->ncols = ncols; + return Py_CLEANUP_SUPPORTED; + +exit: + if (values) PyMem_Free(values); + PyBuffer_Release(view); + return 0; +} + +/* -- mask ----------------------------------------------------------------- */ + +typedef struct { + int** values; + Py_buffer view; +} Mask; + +static int +mask_converter(PyObject* object, void* pointer) +{ + Mask* mask = pointer; + int nrows; + int ncols; + int i; + int** values = mask->values; + Py_buffer* view = &mask->view; + const char* p; + Py_ssize_t stride; + const int flag = PyBUF_ND | PyBUF_STRIDES; + + if (object == NULL) goto exit; + if (object == Py_None) return 1; + + if (PyObject_GetBuffer(object, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, "mask has unexpected format."); + return 0; + } + + if (view->ndim != 2) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect rank %d (expected 2)", view->ndim); + goto exit; + } + if (view->itemsize != sizeof(int)) { + PyErr_SetString(PyExc_RuntimeError, "mask has incorrect data type"); + goto exit; + } + nrows = (int) view->shape[0]; + ncols = (int) view->shape[1]; + if (nrows != view->shape[0] || ncols != view->shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask is too large (dimensions = %zd x %zd)", + view->shape[0], view->shape[1]); + goto exit; + } + stride = view->strides[0]; + if (view->strides[1] != view->itemsize) { + PyErr_SetString(PyExc_RuntimeError, "mask is not contiguous"); + goto exit; + } + values = PyMem_Malloc(nrows*sizeof(int*)); + if (!values) { + PyErr_NoMemory(); + goto exit; + } + for (i = 0, p = view->buf; i < nrows; i++, p += stride) + values[i] = (int*)p; + mask->values = values; + return Py_CLEANUP_SUPPORTED; + +exit: + if (values) PyMem_Free(values); + PyBuffer_Release(view); + return 0; +} + +/* -- 1d array ------------------------------------------------------------- */ + +static int +vector_converter(PyObject* object, void* pointer) +{ + Py_buffer* view = pointer; + int ndata; + const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; + + if (object == 
NULL) goto exit;
+
+    if (PyObject_GetBuffer(object, view, flag) == -1) {
+        PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
+        return 0;
+    }
+
+    if (view->ndim != 1) {
+        PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
+            view->ndim);
+        goto exit;
+    }
+    if (view->itemsize != sizeof(double)) {
+        PyErr_SetString(PyExc_RuntimeError, "array has incorrect data type");
+        goto exit;
+    }
+    ndata = (int) view->shape[0];
+    if (ndata != view->shape[0]) {
+        PyErr_Format(PyExc_ValueError,
+            "array is too large (size = %zd)", view->shape[0]);
+        goto exit;
+    }
+    return Py_CLEANUP_SUPPORTED;
+
+exit:
+    PyBuffer_Release(view);
+    return 0;
+}
+
+static int
+vector_none_converter(PyObject* object, void* pointer)
+{
+    if (object == Py_None) return 1;
+    return vector_converter(object, pointer);
+}
+
+/* -- clusterid ------------------------------------------------------------ */
+
+static int
+check_clusterid(Py_buffer clusterid, int nitems) {
+    int i, j;
+    int *p = clusterid.buf;
+    int nclusters = 0;
+    int* number;
+
+    if (nitems != clusterid.shape[0]) {
+        PyErr_Format(PyExc_ValueError, "incorrect size (%zd, expected %d)",
+            clusterid.shape[0], nitems);
+        return 0;
+    }
+    for (i = 0; i < nitems; i++) {
+        j = p[i];
+        if (j > nclusters) nclusters = j;
+        if (j < 0) {
+            PyErr_SetString(PyExc_ValueError, "negative cluster number found");
+            return 0;
+        }
+    }
+    nclusters++;
+    /* -- Count the number of items in each cluster --------------------- */
+    number = calloc(nclusters, sizeof(int));
+    if (!number) {
+        PyErr_NoMemory();
+        return 0;
+    }
+    for (i = 0; i < nitems; i++) {
+        j = p[i];
+        number[j]++;
+    }
+    for (j = 0; j < nclusters; j++) if (number[j] == 0) break;
+    free(number); /* allocated with calloc, so release with free */
+    if (j < nclusters) {
+        PyErr_Format(PyExc_ValueError, "cluster %d is empty", j);
+        return 0;
+    }
+    return nclusters;
+}
+
+/* -- distance ----------------------------------------------------------- */
+
+typedef struct {
+    int n;
+    double** values;
+    Py_buffer* views;
+    Py_buffer view;
+} Distancematrix;
+
+static int
+_convert_list_to_distancematrix(PyObject* list, Distancematrix* distances)
+{
+    int i;
+    double** values;
+    Py_buffer* view;
+    Py_buffer* views;
+    const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+    const int n = (int) PyList_GET_SIZE(list);
+
+    if (n != PyList_GET_SIZE(list)) {
+        PyErr_SetString(PyExc_ValueError, "distance matrix is too large");
+        return 0;
+    }
+    values = PyMem_Malloc(n*sizeof(double*));
+    if (!values) {
+        PyErr_NoMemory();
+        return 0;
+    }
+    distances->values = values;
+    views = PyMem_Malloc(n*sizeof(Py_buffer));
+    if (!views) {
+        PyErr_NoMemory();
+        return 0;
+    }
+    view = views;
+    for (i = 0; i < n; i++, view++) {
+        PyObject* item = PyList_GET_ITEM(list, i);
+        view->len = -1;
+        if (PyObject_GetBuffer(item, view, flag) == -1) {
+            PyErr_Format(PyExc_RuntimeError, "failed to parse row %d.", i);
+            view--;
+            break;
+        }
+        if (view->ndim != 1) {
+            PyErr_Format(PyExc_ValueError,
+                "row %d has incorrect rank (%d, expected 1)",
+                i, view->ndim);
+            break;
+        }
+        if (view->itemsize != sizeof(double)) {
+            PyErr_Format(PyExc_RuntimeError,
+                "row %d has incorrect data type", i);
+            break;
+        }
+        if (view->shape[0] != i) {
+            PyErr_Format(PyExc_RuntimeError,
+                "row %d has incorrect size %zd (expected %d)",
+                i, view->shape[0], i);
+            break;
+        }
+        values[i] = view->buf;
+    }
+    if (i < n) {
+        for ( ; view >= views; view--) PyBuffer_Release(view);
+        PyMem_Free(views);
+        return 0;
+    }
+    distances->n = n;
+    distances->view.len = 0;
+    distances->views = views;
+    distances->values = values;
+    return 1;
+}
+
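+/*
+ * For illustration (not part of the upstream library): both distance-matrix
+ * converters accept the same strict lower-triangular storage convention.
+ * For n = 4 elements the expected layout is
+ *
+ *     row 0:  (empty)
+ *     row 1:  d(1,0)
+ *     row 2:  d(2,0) d(2,1)
+ *     row 3:  d(3,0) d(3,1) d(3,2)
+ *
+ * i.e. row i of a list argument carries exactly i doubles (checked above via
+ * view->shape[0] != i), while a flat 1D buffer concatenates these rows into
+ * n*(n-1)/2 values (checked below via n*n-n != 2*m).
+ */
+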
+static int +_convert_array_to_distancematrix(PyObject* array, Distancematrix* distances) +{ + int i; + int n; + double** values; + double* p; + Py_buffer* view = &distances->view; + const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; + + if (PyObject_GetBuffer(array, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, + "distance matrix has unexpected format."); + return 0; + } + + if (view->len == 0) { + PyBuffer_Release(view); + PyErr_SetString(PyExc_ValueError, "distance matrix is empty"); + return 0; + } + if (view->itemsize != sizeof(double)) { + PyErr_SetString(PyExc_RuntimeError, + "distance matrix has an incorrect data type"); + return 0; + } + if (view->ndim == 1) { + int m = (int) view->shape[0]; + if (m != view->shape[0]) { + PyErr_Format(PyExc_ValueError, + "distance matrix is too large (size = %zd)", + view->shape[0]); + return 0; + } + n = (int)(1+sqrt(1+8*m)/2); /* rounds to (1+sqrt(1+8*m))/2 */ + if (n*n-n != 2 * m) { + PyErr_SetString(PyExc_ValueError, + "distance matrix has unexpected size."); + return 0; + } + distances->n = n; + values = PyMem_Malloc(n*sizeof(double*)); + if (!values) { + PyErr_NoMemory(); + return 0; + } + distances->values = values; + for (p = view->buf, i = 0; i < n; p += i, i++) values[i] = p; + } + else if (view->ndim == 2) { + n = (int) view->shape[0]; + if (n != view->shape[0]) { + PyErr_Format(PyExc_ValueError, + "distance matrix is too large (size = %zd)", + view->shape[0]); + return 0; + } + distances->n = n; + if (view->shape[1] != n) { + PyErr_SetString(PyExc_ValueError, + "distance matrix is not square."); + return 0; + } + values = PyMem_Malloc(n*sizeof(double*)); + if (!values) { + PyErr_NoMemory(); + return 0; + } + distances->values = values; + for (p = view->buf, i = 0; i < n; p += n, i++) values[i] = p; + } + else { + PyErr_Format(PyExc_ValueError, + "distance matrix has incorrect rank %d (expected 1 or 2)", + view->ndim); + return 0; + } + return 1; +} + +static int +distancematrix_converter(PyObject* argument, void* pointer) +{ + Distancematrix* distances = pointer; + double** values; + + if (argument == NULL) goto exit; + if (argument == Py_None) return 1; + if (PyList_Check(argument)) { + if (_convert_list_to_distancematrix(argument, distances)) + return Py_CLEANUP_SUPPORTED; + } + else { + if (_convert_array_to_distancematrix(argument, distances)) + return Py_CLEANUP_SUPPORTED; + } + +exit: + values = distances->values; + if (values == NULL) return 0; + else { + int i; + const int n = distances->n; + Py_buffer* views = distances->views; + if (views) { + for (i = 0; i < n; i++) PyBuffer_Release(&views[i]); + PyMem_Free(views); + } + else if (distances->view.len) { + PyBuffer_Release(&distances->view); + } + PyMem_Free(values); + } + return 0; +} + +/* -- celldata ------------------------------------------------------------- */ + +typedef struct { + int nx; + int ny; + int nz; + double*** values; + Py_buffer view; +} Celldata; + +static int +celldata_converter(PyObject* argument, void* pointer) +{ + int i, n; + double* p; + Celldata* celldata = pointer; + double*** ppp = celldata->values; + double** pp = ppp ? 
ppp[0] : NULL; + int nx; + int ny; + int nz; + Py_buffer* view = &celldata->view; + const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; + + if (argument == NULL) goto exit; + + if (PyObject_GetBuffer(argument, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, + "celldata array has unexpected format."); + return 0; + } + + nx = (int) view->shape[0]; + ny = (int) view->shape[1]; + nz = (int) view->shape[2]; + if (nx != view->shape[0] || ny != view->shape[1] || nz != view->shape[2]) { + PyErr_SetString(PyExc_RuntimeError, "celldata array too large"); + goto exit; + } + if (view->itemsize != sizeof(double)) { + PyErr_SetString(PyExc_RuntimeError, + "celldata array has incorrect data type"); + goto exit; + } + pp = PyMem_Malloc(nx*ny*sizeof(double*)); + ppp = PyMem_Malloc(nx*sizeof(double**)); + if (!pp || !ppp) { + PyErr_NoMemory(); + goto exit; + } + p = view->buf; + n = nx * ny; + for (i = 0; i < n; i++, p += nz) pp[i] = p; + for (i = 0; i < nx; i++, pp += ny) ppp[i] = pp; + celldata->values = ppp; + celldata->nx = nx; + celldata->ny = ny; + celldata->nz = nz; + return Py_CLEANUP_SUPPORTED; + +exit: + if (pp) PyMem_Free(pp); + if (ppp) PyMem_Free(ppp); + PyBuffer_Release(view); + return 0; +} + + +/* -- index ---------------------------------------------------------------- */ + +static int +index_converter(PyObject* argument, void* pointer) +{ + Py_buffer* view = pointer; + int n; + const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; + + if (argument == NULL) goto exit; + + if (PyObject_GetBuffer(argument, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, "unexpected format."); + return 0; + } + + if (view->ndim != 1) { + PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)", + view->ndim); + goto exit; + } + if (view->itemsize != sizeof(int)) { + PyErr_SetString(PyExc_RuntimeError, + "argument has incorrect data type"); + goto exit; + } + n = (int) view->shape[0]; + if (n != view->shape[0]) { + PyErr_Format(PyExc_ValueError, + "array size is too large (size = %zd)", view->shape[0]); + goto exit; + } + return Py_CLEANUP_SUPPORTED; + +exit: + PyBuffer_Release(view); + return 0; +} + +/* -- index2d ------------------------------------------------------------- */ + +static int +index2d_converter(PyObject* argument, void* pointer) +{ + Py_buffer* view = pointer; + int n; + const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; + + if (argument == NULL) goto exit; + + if (PyObject_GetBuffer(argument, view, flag) == -1) { + PyErr_SetString(PyExc_RuntimeError, "unexpected format."); + return 0; + } + + if (view->ndim != 2) { + PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 2)", + view->ndim); + goto exit; + } + if (view->itemsize != sizeof(int)) { + PyErr_SetString(PyExc_RuntimeError, + "argument has incorrect data type"); + goto exit; + } + n = (int) view->shape[0]; + if (n != view->shape[0]) { + PyErr_Format(PyExc_ValueError, + "array size is too large (size = %zd)", view->shape[0]); + goto exit; + } + if (view->shape[1] != 2) { + PyErr_Format(PyExc_ValueError, + "array has %zd columns (expected 2)", view->shape[1]); + goto exit; + } + return Py_CLEANUP_SUPPORTED; + +exit: + PyBuffer_Release(view); + return 0; +} + +/* ========================================================================= */ +/* -- Classes -------------------------------------------------------------- */ +/* ========================================================================= */ + +typedef struct { + PyObject_HEAD + Node node; +} PyNode; + +static int +PyNode_init(PyNode *self, PyObject *args, 
PyObject *kwds) +{ + int left, right; + double distance = 0.0; + static char *kwlist[] = {"left", "right", "distance", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|d", kwlist, + &left, &right, &distance)) + return -1; + self->node.left = left; + self->node.right = right; + self->node.distance = distance; + return 0; +} + +static PyObject* +PyNode_repr(PyNode* self) +{ + char string[64]; + + sprintf(string, "(%d, %d): %g", + self->node.left, self->node.right, self->node.distance); + return PyUnicode_FromString(string); +} + +static char PyNode_left__doc__[] = +"integer representing the first member of this node"; + +static PyObject* +PyNode_getleft(PyNode* self, void* closure) +{ + int left = self->node.left; + + return PyLong_FromLong((long)left); +} + +static int +PyNode_setleft(PyNode* self, PyObject* value, void* closure) +{ + long left = PyLong_AsLong(value); + + if (PyErr_Occurred()) return -1; + self->node.left = (int) left; + return 0; +} + +static char PyNode_right__doc__[] = +"integer representing the second member of this node"; + +static PyObject* +PyNode_getright(PyNode* self, void* closure) +{ + int right = self->node.right; + + return PyLong_FromLong((long)right); +} + +static int +PyNode_setright(PyNode* self, PyObject* value, void* closure) +{ + long right = PyLong_AsLong(value); + + if (PyErr_Occurred()) return -1; + self->node.right = (int) right; + return 0; +} + +static PyObject* +PyNode_getdistance(PyNode* self, void* closure) +{ + return PyFloat_FromDouble(self->node.distance); +} + +static int +PyNode_setdistance(PyNode* self, PyObject* value, void* closure) +{ + const double distance = PyFloat_AsDouble(value); + + if (PyErr_Occurred()) return -1; + self->node.distance = distance; + return 0; +} + +static char PyNode_distance__doc__[] = +"the distance between the two members of this node\n"; + +static PyGetSetDef PyNode_getset[] = { + {"left", + (getter)PyNode_getleft, + (setter)PyNode_setleft, + PyNode_left__doc__, NULL}, + {"right", + (getter)PyNode_getright, + (setter)PyNode_setright, + PyNode_right__doc__, NULL}, + {"distance", + (getter)PyNode_getdistance, + (setter)PyNode_setdistance, + PyNode_distance__doc__, NULL}, + {NULL} /* Sentinel */ +}; + +static char PyNode_doc[] = +"A Node object describes a single node in a hierarchical clustering tree.\n" +"The integer attributes 'left' and 'right' represent the two members that\n" +"make up this node; the floating point attribute 'distance' contains the\n" +"distance between the two members of this node.\n"; + +static PyTypeObject PyNodeType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_cluster.Node", /* tp_name */ + sizeof(PyNode), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + (reprfunc)PyNode_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + PyNode_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + PyNode_getset, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)PyNode_init, /* tp_init */ +}; + +typedef struct { + PyObject_HEAD + Node* 
nodes;
+    int n;
+} PyTree;
+
+static void
+PyTree_dealloc(PyTree* self)
+{
+    if (self->n) PyMem_Free(self->nodes);
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject*
+PyTree_new(PyTypeObject *type, PyObject* args, PyObject* kwds)
+{
+    int i, j;
+    int n;
+    Node* nodes;
+    PyObject* arg = NULL;
+    int* flag;
+    PyTree* self;
+
+    self = (PyTree *)type->tp_alloc(type, 0);
+    if (!self) return NULL;
+
+    if (!PyArg_ParseTuple(args, "|O", &arg)) {
+        Py_DECREF(self);
+        return NULL;
+    }
+
+    if (arg == NULL) {
+        self->n = 0;
+        self->nodes = NULL;
+        return (PyObject*)self;
+    }
+
+    if (!PyList_Check(arg)) {
+        Py_DECREF(self);
+        PyErr_SetString(PyExc_TypeError,
+            "Argument should be a list of Node objects");
+        return NULL;
+    }
+
+    n = (int) PyList_GET_SIZE(arg);
+    if (n != PyList_GET_SIZE(arg)) {
+        Py_DECREF(self);
+        PyErr_Format(PyExc_ValueError,
+            "List is too large (size = %zd)", PyList_GET_SIZE(arg));
+        return NULL;
+    }
+    if (n < 1) {
+        Py_DECREF(self);
+        PyErr_SetString(PyExc_ValueError, "List is empty");
+        return NULL;
+    }
+    nodes = PyMem_Malloc(n*sizeof(Node));
+    if (!nodes) {
+        Py_DECREF(self);
+        return PyErr_NoMemory();
+    }
+    for (i = 0; i < n; i++) {
+        PyNode* p;
+        PyObject* row = PyList_GET_ITEM(arg, i);
+        if (!PyType_IsSubtype(Py_TYPE(row), &PyNodeType)) {
+            PyMem_Free(nodes);
+            Py_DECREF(self);
+            PyErr_Format(PyExc_TypeError,
+                "Row %d in list is not a Node object", i);
+            return NULL;
+        }
+        p = (PyNode*)row;
+        nodes[i] = p->node;
+    }
+    /* --- Check if this is a bona fide tree ------------------------------- */
+    flag = PyMem_Malloc((2*n+1)*sizeof(int));
+    if (!flag) {
+        PyMem_Free(nodes);
+        Py_DECREF(self);
+        return PyErr_NoMemory();
+    }
+    for (i = 0; i < 2*n+1; i++) flag[i] = 0;
+    for (i = 0; i < n; i++) {
+        j = nodes[i].left;
+        if (j < 0) {
+            j = -j-1;
+            if (j >= i) break;
+        }
+        else j += n;
+        if (flag[j]) break;
+        flag[j] = 1;
+        j = nodes[i].right;
+        if (j < 0) {
+            j = -j-1;
+            if (j >= i) break;
+        }
+        else j += n;
+        if (flag[j]) break;
+        flag[j] = 1;
+    }
+    PyMem_Free(flag);
+    if (i < n) {
+        /* break encountered */
+        PyMem_Free(nodes);
+        Py_DECREF(self);
+        PyErr_SetString(PyExc_ValueError, "Inconsistent tree");
+        return NULL;
+    }
+    self->n = n;
+    self->nodes = nodes;
+    return (PyObject*)self;
+}
+
+static PyObject*
+PyTree_str(PyTree* self)
+{
+    int i;
+    const int n = self->n;
+    char string[128];
+    Node node;
+    PyObject* line;
+    PyObject* output;
+    PyObject* temp;
+
+    output = PyUnicode_FromString("");
+    for (i = 0; i < n; i++) {
+        node = self->nodes[i];
+        sprintf(string, "(%d, %d): %g", node.left, node.right, node.distance);
+        if (i < n-1) strcat(string, "\n");
+        line = PyUnicode_FromString(string);
+        if (!line) {
+            Py_DECREF(output);
+            return NULL;
+        }
+        temp = PyUnicode_Concat(output, line);
+        Py_DECREF(line);   /* drop the row string once it has been concatenated */
+        Py_DECREF(output); /* drop the old accumulator so it does not leak */
+        if (!temp) return NULL;
+        output = temp;
+    }
+    return output;
+}
+
+static int
+PyTree_length(PyTree *self)
+{
+    return self->n;
+}
+
+static PyObject*
+PyTree_subscript(PyTree* self, PyObject* item)
+{
+    if (PyIndex_Check(item)) {
+        PyNode* result;
+        Py_ssize_t i;
+        i = PyNumber_AsSsize_t(item, PyExc_IndexError);
+        if (i == -1 && PyErr_Occurred())
+            return NULL;
+        if (i < 0)
+            i += self->n;
+        if (i < 0 || i >= self->n) {
+            PyErr_SetString(PyExc_IndexError, "tree index out of range");
+            return NULL;
+        }
+        result = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
+        if (!result) return PyErr_NoMemory();
+        result->node = self->nodes[i];
+        return (PyObject*) result;
+    }
+    else if (PySlice_Check(item)) {
+        Py_ssize_t i, j;
+        Py_ssize_t start, stop, step, slicelength;
+        if (PySlice_GetIndicesEx(item, self->n, &start, &stop, &step,
+            &slicelength) == -1) return NULL;
+        if (slicelength == 0) return PyList_New(0);
+        else {
+            PyNode* node;
+            PyObject* result = PyList_New(slicelength);
+            if (!result) return PyErr_NoMemory();
+            for (i = 0, j = start; i < slicelength; i++, j += step) {
+                node = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
+                if (!node) {
+                    Py_DECREF(result);
+                    return PyErr_NoMemory();
+                }
+                node->node = self->nodes[j];
+                PyList_SET_ITEM(result, i, (PyObject*)node);
+            }
+            return result;
+        }
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+            "tree indices must be integers, not %.200s",
+            item->ob_type->tp_name);
+        return NULL;
+    }
+}
+
+static PyMappingMethods PyTree_mapping = {
+    (lenfunc)PyTree_length,       /* mp_length */
+    (binaryfunc)PyTree_subscript, /* mp_subscript */
+};
+
+static char PyTree_scale__doc__[] =
+"mytree.scale()\n"
+"\n"
+"Scale the node distances in the tree such that they are all between zero\n"
+"and one.\n";
+
+static PyObject*
+PyTree_scale(PyTree* self)
+{
+    int i;
+    const int n = self->n;
+    Node* nodes = self->nodes;
+    double maximum = DBL_MIN;
+
+    for (i = 0; i < n; i++) {
+        double distance = nodes[i].distance;
+        if (distance > maximum) maximum = distance;
+    }
+    if (maximum != 0.0)
+        for (i = 0; i < n; i++) nodes[i].distance /= maximum;
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+static char PyTree_cut__doc__[] =
+"mytree.cut(nclusters) -> array\n"
+"\n"
+"Divide the elements in a hierarchical clustering result mytree into\n"
+"clusters, and return an array with the number of the cluster to which each\n"
+"element was assigned. The number of clusters is given by nclusters.\n";
+
+static PyObject*
+PyTree_cut(PyTree* self, PyObject* args)
+{
+    int ok = -1;
+    int nclusters;
+    const int n = self->n + 1;
+    Py_buffer indices = {0};
+
+    if (!PyArg_ParseTuple(args, "O&i",
+        index_converter, &indices, &nclusters)) goto exit;
+    if (nclusters < 1) {
+        PyErr_SetString(PyExc_ValueError,
+            "requested number of clusters should be positive");
+        goto exit;
+    }
+    if (nclusters > n) {
+        PyErr_SetString(PyExc_ValueError,
+            "more clusters requested than items available");
+        goto exit;
+    }
+    if (indices.shape[0] != n) {
+        PyErr_SetString(PyExc_RuntimeError,
+            "indices array inconsistent with tree");
+        goto exit;
+    }
+    ok = cuttree(n, self->nodes, nclusters, indices.buf);
+
+exit:
+    index_converter(NULL, &indices);
+    if (ok == -1) return NULL;
+    if (ok == 0) return PyErr_NoMemory();
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+static char PyTree_sort__doc__[] =
+"mytree.sort(order) -> array\n"
+"\n"
+"Sort a hierarchical clustering tree by switching the left and right\n"
+"subnode of nodes such that the elements in the left-to-right order of the\n"
+"tree tend to have increasing order values.\n"
+"\n"
+"Return the indices of the elements in the left-to-right order in the\n"
+"hierarchical clustering tree, such that the element with index indices[i]\n"
+"occurs at position i in the dendrogram.\n";
+
+static PyObject*
+PyTree_sort(PyTree* self, PyObject* args)
+{
+    int ok = -1;
+    Py_buffer indices = {0};
+    const int n = self->n;
+    Py_buffer order = {0};
+
+    if (n == 0) {
+        PyErr_SetString(PyExc_ValueError, "tree is empty");
+        return NULL;
+    }
+    if (!PyArg_ParseTuple(args, "O&O&",
+        index_converter, &indices,
+        vector_converter, &order)) goto exit;
+    if (indices.shape[0] != n + 1) {
+        PyErr_SetString(PyExc_RuntimeError,
+            "indices array inconsistent with tree");
+        goto exit;
+    }
+    if (order.shape[0] != n + 1) {
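+        /* a tree with n nodes joins n+1 elements, so order needs one value per element */
+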
PyErr_Format(PyExc_ValueError, + "order array has incorrect size %zd (expected %d)", + order.shape[0], n + 1); + goto exit; + } + ok = sorttree(n, self->nodes, order.buf, indices.buf); +exit: + index_converter(NULL, &indices); + vector_converter(NULL, &order); + if (ok == -1) return NULL; + if (ok == 0) return PyErr_NoMemory(); + Py_INCREF(Py_None); + return Py_None; +} + +static PyMethodDef PyTree_methods[] = { + {"scale", (PyCFunction)PyTree_scale, METH_NOARGS, PyTree_scale__doc__}, + {"cut", (PyCFunction)PyTree_cut, METH_VARARGS, PyTree_cut__doc__}, + {"sort", (PyCFunction)PyTree_sort, METH_VARARGS, PyTree_sort__doc__}, + {NULL} /* Sentinel */ +}; + +static char PyTree_doc[] = +"Tree objects store a hierarchical clustering solution.\n" +"Individual nodes in the tree can be accessed with tree[i], where i is\n" +"an integer. Whereas the tree itself is a read-only object, tree[:]\n" +"returns a list of all the nodes, which can then be modified. To create\n" +"a new Tree from this list, use Tree(list).\n" +"See the description of the Node class for more information."; + +static PyTypeObject PyTreeType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_cluster.Tree", /* tp_name */ + sizeof(PyTree), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)PyTree_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + &PyTree_mapping, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + (reprfunc)PyTree_str, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + PyTree_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + PyTree_methods, /* tp_methods */ + NULL, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + (newfunc)PyTree_new, /* tp_new */ +}; + +/* ========================================================================= */ +/* -- Methods -------------------------------------------------------------- */ +/* ========================================================================= */ + +/* version */ +static char version__doc__[] = +"version() -> string\n" +"\n" +"Return the version number of the C Clustering Library as a string.\n"; + +static PyObject* +py_version(PyObject* self) +{ + return PyUnicode_FromString( CLUSTERVERSION ); +} + +/* kcluster */ +static char kcluster__doc__[] = +"kcluster(data, nclusters, mask, weight, transpose, npass, method,\n" +" dist, clusterid) -> None\n" +"\n" +"This function implements k-means clustering.\n" +"\n" +"Arguments:\n" +"\n" +" - data: nrows x ncols array containing the data to be clustered\n" +"\n" +" - nclusters: number of clusters (the 'k' in k-means)\n" +"\n" +" - mask: nrows x ncols array of integers, showing which data are\n" +" missing. If mask[i,j] == 0, then data[i,j] is missing.\n" +"\n" +" - weight: the weights to be used when calculating distances\n" +" - transpose:\n" +"\n" +" - if equal to 0, rows are clustered;\n" +" - if equal to 1, columns are clustered.\n" +"\n" +" - npass: number of times the k-means clustering algorithm is\n" +" performed, each time with a different (random) initial\n" +" condition. 
If npass == 0, then the assignments in clusterid\n" +" are used as the initial condition.\n" +"\n" +" - method: specifies how the center of a cluster is found:\n" +"\n" +" - method == 'a': arithmetic mean\n" +" - method == 'm': median\n" +"\n" +" - dist: specifies the distance function to be used:\n" +"\n" +" - dist == 'e': Euclidean distance\n" +" - dist == 'b': City Block distance\n" +" - dist == 'c': Pearson correlation\n" +" - dist == 'a': absolute value of the correlation\n" +" - dist == 'u': uncentered correlation\n" +" - dist == 'x': absolute uncentered correlation\n" +" - dist == 's': Spearman's rank correlation\n" +" - dist == 'k': Kendall's tau\n" +"\n" +" - clusterid: array in which the final clustering solution will be\n" +" stored (output variable). If npass == 0, then clusterid is also used\n" +" as an input variable, containing the initial condition from which\n" +" the EM algorithm should start. In this case, the k-means algorithm\n" +" is fully deterministic.\n" +"\n"; + +static PyObject* +py_kcluster(PyObject* self, PyObject* args, PyObject* keywords) +{ + int nclusters = 2; + int nrows, ncols; + int nitems; + int ndata; + Data data = {0}; + Mask mask = {0}; + Py_buffer weight = {0}; + int transpose = 0; + int npass = 1; + char method = 'a'; + char dist = 'e'; + Py_buffer clusterid = {0}; + double error; + int ifound = 0; + + static char* kwlist[] = {"data", + "nclusters", + "mask", + "weight", + "transpose", + "npass", + "method", + "dist", + "clusterid", + NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iO&O&iiO&O&O&", kwlist, + data_converter, &data, + &nclusters, + mask_converter, &mask, + vector_converter, &weight, + &transpose, + &npass, + method_kcluster_converter, &method, + distance_converter, &dist, + index_converter, &clusterid)) return NULL; + if (!data.values) { + PyErr_SetString(PyExc_RuntimeError, "data is None"); + goto exit; + } + if (!mask.values) { + PyErr_SetString(PyExc_RuntimeError, "mask is None"); + goto exit; + } + if (data.nrows != mask.view.shape[0] || + data.ncols != mask.view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect dimensions %zd x %zd (expected %d x %d)", + mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + ndata = transpose ? nrows : ncols; + nitems = transpose ? 
ncols : nrows; + if (weight.shape[0] != ndata) { + PyErr_Format(PyExc_ValueError, + "weight has incorrect size %zd (expected %d)", + weight.shape[0], ndata); + goto exit; + } + if (nclusters < 1) { + PyErr_SetString(PyExc_ValueError, "nclusters should be positive"); + goto exit; + } + if (nitems < nclusters) { + PyErr_SetString(PyExc_ValueError, + "more clusters than items to be clustered"); + goto exit; + } + if (npass < 0) { + PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer"); + goto exit; + } + else if (npass == 0) { + int n = check_clusterid(clusterid, nitems); + if (n == 0) goto exit; + if (n != nclusters) { + PyErr_SetString(PyExc_ValueError, + "more clusters requested than found in clusterid"); + goto exit; + } + } + kcluster(nclusters, + nrows, + ncols, + data.values, + mask.values, + weight.buf, + transpose, + npass, + method, + dist, + clusterid.buf, + &error, + &ifound); +exit: + data_converter(NULL, &data); + mask_converter(NULL, &mask); + vector_converter(NULL, &weight); + index_converter(NULL, &clusterid); + if (ifound) return Py_BuildValue("di", error, ifound); + return NULL; +} +/* end of wrapper for kcluster */ + +/* kmedoids */ +static char kmedoids__doc__[] = +"kmedoids(distance, nclusters, npass, clusterid) -> error, nfound\n" +"\n" +"This function implements k-medoids clustering.\n" +"\n" +"Arguments:\n" +" - distance: The distance matrix between the elements. There are three\n" +" ways in which you can pass a distance matrix:\n" +"\n" +" 1. a 2D Numerical Python array (in which only the left-lower\n" +" part of the array will be accessed);\n" +" 2. a 1D Numerical Python array containing the distances\n" +" consecutively;\n" +" 3. a list of rows containing the lower-triangular part of\n" +" the distance matrix.\n" +"\n" +" Examples are:\n" +"\n" +" >>> from numpy import array\n" +" >>> distance = array([[0.0, 1.1, 2.3],\n" +" ... [1.1, 0.0, 4.5],\n" +" ... [2.3, 4.5, 0.0]])\n" +" >>> # (option #1)\n" +" >>> distance = array([1.1, 2.3, 4.5])\n" +" >>> # (option #2)\n" +" >>> distance = [array([]),\n" +" ... array([1.1]),\n" +" ... array([2.3, 4.5])]\n" +" >>> # (option #3)\n" +"\n" +" These three correspond to the same distance matrix.\n" +"\n" +" - nclusters: number of clusters (the 'k' in k-medoids)\n" +"\n" +" - npass: number of times the k-medoids clustering algorithm is\n" +" performed, each time with a different (random) initial\n" +" condition. If npass == 0, then the assignments in clusterid\n" +" are used as the initial condition.\n" +"\n" +" - clusterid: array in which the final clustering solution will be\n" +" stored (output variable). If npass == 0, then clusterid is also used\n" +" as an input variable, containing the initial condition from which\n" +" the EM algorithm should start. 
In this case, the k-medoids algorithm\n"
+"   is fully deterministic.\n"
+"\n"
+"Return values:\n"
+" - error: the within-cluster sum of distances for the returned k-medoids\n"
+"   clustering solution;\n"
+" - nfound: the number of times this solution was found.\n";
+
+static PyObject*
+py_kmedoids(PyObject* self, PyObject* args, PyObject* keywords)
+{
+    int nclusters = 2;
+    Distancematrix distances = {0};
+    Py_buffer clusterid = {0};
+    int npass = 1;
+    double error;
+    int ifound = -2;
+
+    static char* kwlist[] = {"distance",
+                             "nclusters",
+                             "npass",
+                             "clusterid",
+                             NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iiO&", kwlist,
+                                     distancematrix_converter, &distances,
+                                     &nclusters,
+                                     &npass,
+                                     index_converter, &clusterid)) return NULL;
+    if (npass < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
+        goto exit;
+    }
+    else if (npass == 0) {
+        int n = check_clusterid(clusterid, distances.n);
+        if (n == 0) goto exit;
+        if (n != nclusters) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "more clusters requested than found in clusterid");
+            goto exit;
+        }
+    }
+    if (nclusters <= 0) {
+        PyErr_SetString(PyExc_ValueError,
+            "nclusters should be a positive integer");
+        goto exit;
+    }
+    if (distances.n < nclusters) {
+        PyErr_SetString(PyExc_ValueError,
+            "more clusters requested than items to be clustered");
+        goto exit;
+    }
+    kmedoids(nclusters,
+             distances.n,
+             distances.values,
+             npass,
+             clusterid.buf,
+             &error,
+             &ifound);
+
+exit:
+    distancematrix_converter(NULL, &distances);
+    index_converter(NULL, &clusterid);
+    switch (ifound) {
+        case -2:
+            return NULL;
+        case -1:
+            return PyErr_NoMemory();
+        case 0: /* should not occur */
+            PyErr_SetString(PyExc_RuntimeError,
+                "error in kmedoids input arguments");
+            return NULL;
+        default:
+            return Py_BuildValue("di", error, ifound);
+    }
+}
+/* end of wrapper for kmedoids */
+
+/* treecluster */
+static char treecluster__doc__[] =
+"treecluster(tree, data, mask, weight, transpose, dist, method,\n"
+"            distancematrix) -> None\n"
+"\n"
+"This function implements the pairwise single, complete, centroid, and\n"
+"average linkage hierarchical clustering methods.\n"
+"\n"
+"Arguments:\n"
+" - tree: an empty Tree object; its nodes will be filled by treecluster\n"
+"   to describe the hierarchical clustering result. See the description\n"
+"   of the Tree class for more information.\n"
+"\n"
+" - data: nrows x ncols array containing the data to be clustered.\n"
+"   Either data or distancematrix (see below) should be None.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+"   missing. 
If mask[i,j]==0, then data[i,j] is missing.\n" +"\n" +" - weight: the weights to be used when calculating distances.\n" +"\n" +" - transpose:\n" +"\n" +" - if equal to 0, rows are clustered;\n" +" - if equal to 1, columns are clustered.\n" +"\n" +" - dist: specifies the distance function to be used:\n" +"\n" +" - dist == 'e': Euclidean distance\n" +" - dist == 'b': City Block distance\n" +" - dist == 'c': Pearson correlation\n" +" - dist == 'a': absolute value of the correlation\n" +" - dist == 'u': uncentered correlation\n" +" - dist == 'x': absolute uncentered correlation\n" +" - dist == 's': Spearman's rank correlation\n" +" - dist == 'k': Kendall's tau\n" +"\n" +" - method: specifies which linkage method is used:\n" +"\n" +" - method == 's': Single pairwise linkage\n" +" - method == 'm': Complete (maximum) pairwise linkage (default)\n" +" - method == 'c': Centroid linkage\n" +" - method == 'a': Average pairwise linkage\n" +"\n" +" - distancematrix: The distance matrix between the elements.\n" +" Either data (see above) or distancematrix should be None.\n" +" There are three ways in which you can pass a distance matrix:\n" +"\n" +" 1. a 2D Numerical Python array (in which only the left-lower\n" +" part of the array will be accessed);\n" +" 2. a 1D Numerical Python array containing the distances\n" +" consecutively;\n" +" 3. a list of rows containing the lower-triangular part of\n" +" the distance matrix.\n" +"\n" +" Examples are:\n" +"\n" +" >>> from numpy import array\n" +" >>> distance = array([[0.0, 1.1, 2.3],\n" +" ... [1.1, 0.0, 4.5],\n" +" ... [2.3, 4.5, 0.0]])\n" +" >>> # option 1.\n" +" >>> distance = array([1.1, 2.3, 4.5])\n" +" >>> # option 2.\n" +" >>> distance = [array([]),\n" +" ... array([1.1]),\n" +" ... array([2.3, 4.5])]\n" +" >>> # option 3.\n" +"\n" +" These three correspond to the same distance matrix.\n" +"\n" +" PLEASE NOTE:\n" +" As the treecluster routine may shuffle the values in the\n" +" distance matrix as part of the clustering algorithm, be sure\n" +" to save this array in a different variable before calling\n" +" treecluster if you need it later.\n" +"\n" +"Either data or distancematrix should be None. If distancematrix is None,\n" +"the hierarchical clustering solution is calculated from the values in\n" +"the argument data. 
Instead if data is None, the hierarchical clustering\n" +"solution is calculated from the distance matrix.\n" +"Pairwise centroid-linkage clustering can be calculated only from the data\n" +"and not from the distance matrix.\n" +"Pairwise single-, maximum-, and average-linkage clustering can be\n" +"calculated from either the data or from the distance matrix.\n"; + +static PyObject* +py_treecluster(PyObject* self, PyObject* args, PyObject* keywords) +{ + Data data = {0}; + Mask mask = {0}; + Py_buffer weight = {0}; + int transpose = 0; + char dist = 'e'; + char method = 'm'; + Distancematrix distances = {0}; + PyTree* tree = NULL; + Node* nodes; + int nitems; + + static char* kwlist[] = {"tree", + "data", + "mask", + "weight", + "transpose", + "method", + "dist", + "distancematrix", + NULL }; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O!O&O&O&iO&O&O&", kwlist, + &PyTreeType, &tree, + data_converter, &data, + mask_converter, &mask, + vector_none_converter, &weight, + &transpose, + method_treecluster_converter, &method, + distance_converter, &dist, + distancematrix_converter, &distances)) + return NULL; + + if (tree->n != 0) { + PyErr_SetString(PyExc_RuntimeError, "expected an empty tree"); + goto exit; + } + if (data.values != NULL && distances.values != NULL) { + PyErr_SetString(PyExc_ValueError, + "use either data or distancematrix, do not use both"); + goto exit; + } + if (data.values == NULL && distances.values == NULL) { + PyErr_SetString(PyExc_ValueError, + "neither data nor distancematrix was given"); + goto exit; + } + + if (data.values) /* use the values in data, not the distance matrix */ { + int nrows; + int ncols; + int ndata; + + if (!mask.values) { + PyErr_SetString(PyExc_RuntimeError, "mask is None"); + goto exit; + } + if (!weight.buf) { + PyErr_SetString(PyExc_RuntimeError, "weight is None"); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", + mask.view.shape[0], mask.view.shape[1], + data.nrows, data.ncols); + goto exit; + } + ndata = transpose ? nrows : ncols; + nitems = transpose ? 
ncols : nrows;
+        if (weight.shape[0] != ndata) {
+            PyErr_Format(PyExc_RuntimeError,
+                "weight has incorrect size %zd (expected %d)",
+                weight.shape[0], ndata);
+            goto exit;
+        }
+
+        nodes = treecluster(nrows,
+                            ncols,
+                            data.values,
+                            mask.values,
+                            weight.buf,
+                            transpose,
+                            dist,
+                            method,
+                            NULL);
+    }
+    else { /* use the distance matrix instead of the values in data */
+        if (!strchr("sma", method)) {
+            PyErr_SetString(PyExc_ValueError,
+                "argument method should be 's', 'm', or 'a' "
+                "when specifying the distance matrix");
+            goto exit;
+        }
+        nitems = distances.n;
+        nodes = treecluster(nitems,
+                            nitems,
+                            0,
+                            0,
+                            0,
+                            transpose,
+                            dist,
+                            method,
+                            distances.values);
+    }
+
+    if (!nodes) {
+        PyErr_NoMemory();
+        goto exit;
+    }
+    tree->nodes = nodes;
+    tree->n = nitems-1;
+
+exit:
+    data_converter(NULL, &data);
+    mask_converter(NULL, &mask);
+    vector_none_converter(NULL, &weight);
+    distancematrix_converter(NULL, &distances);
+    if (tree == NULL || tree->n == 0) return NULL;
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+/* end of wrapper for treecluster */
+
+/* somcluster */
+static char somcluster__doc__[] =
+"somcluster(clusterid, celldata, data, mask, weight, transpose,\n"
+"           inittau, niter, dist) -> None\n"
+"\n"
+"This function implements a self-organizing map on a rectangular grid.\n"
+"\n"
+"Arguments:\n"
+" - clusterid: array with two columns, with the number of rows equal\n"
+"   to the number of items being clustered. Upon return, each row\n"
+"   in the array contains the x and y coordinates of the cell in the\n"
+"   rectangular SOM grid to which the item was assigned.\n"
+"\n"
+" - celldata: array with dimensions nxgrid x nygrid x number of columns\n"
+"   if rows are being clustered, or nxgrid x nygrid x number of rows\n"
+"   if columns are being clustered, where nxgrid is the horizontal\n"
+"   dimension of the rectangular SOM map and nygrid is the vertical\n"
+"   dimension of the rectangular SOM map.\n"
+"   Upon return, each element [ix, iy] of this array contains the\n"
+"   data for the centroid of the cluster in the SOM grid cell with\n"
+"   coordinates [ix, iy].\n"
+"\n"
+" - data: nrows x ncols array containing the data to be clustered.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+"   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances\n"
+"\n"
+" - transpose:\n"
+"\n"
+"   - if equal to 0, rows are clustered;\n"
+"   - if equal to 1, columns are clustered.\n"
+"\n"
+" - inittau: the initial value of tau (the neighborhood function)\n"
+"\n"
+" - niter: the number of iterations\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+"   - dist == 'e': Euclidean distance\n"
+"   - dist == 'b': City Block distance\n"
+"   - dist == 'c': Pearson correlation\n"
+"   - dist == 'a': absolute value of the correlation\n"
+"   - dist == 'u': uncentered correlation\n"
+"   - dist == 'x': absolute uncentered correlation\n"
+"   - dist == 's': Spearman's rank correlation\n"
+"   - dist == 'k': Kendall's tau\n";
+
+static PyObject*
+py_somcluster(PyObject* self, PyObject* args, PyObject* keywords)
+{
+    int nrows;
+    int ncols;
+    int ndata;
+    Data data = {0};
+    Mask mask = {0};
+    Py_buffer weight = {0};
+    int transpose = 0;
+    double inittau = 0.02;
+    int niter = 1;
+    char dist = 'e';
+    Py_buffer indices = {0};
+    Celldata celldata = {0};
+    PyObject* result = NULL;
+
+    static char* kwlist[] = {"clusterids",
+                             "celldata",
+                             "data",
+                             "mask",
+                             "weight",
+                             "transpose",
+                             "inittau",
+                             "niter",
+                             "dist",
+                             NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&idiO&", kwlist,
+                                     index2d_converter, &indices,
+                                     celldata_converter, &celldata,
+                                     data_converter, &data,
+                                     mask_converter, &mask,
+                                     vector_converter, &weight,
+                                     &transpose,
+                                     &inittau,
+                                     &niter,
+                                     distance_converter, &dist)) return NULL;
+    if (niter < 1) {
+        PyErr_SetString(PyExc_ValueError,
+            "number of iterations (niter) should be positive");
+        goto exit;
+    }
+    if (!data.values) {
+        PyErr_SetString(PyExc_RuntimeError, "data is None");
+        goto exit;
+    }
+    if (!mask.values) {
+        PyErr_SetString(PyExc_RuntimeError, "mask is None");
+        goto exit;
+    }
+    nrows = data.nrows;
+    ncols = data.ncols;
+    if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+        PyErr_Format(PyExc_ValueError,
+            "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+            mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+        goto exit;
+    }
+    ndata = transpose ? nrows : ncols;
+    if (weight.shape[0] != ndata) {
+        PyErr_Format(PyExc_RuntimeError,
+            "weight has incorrect size %zd (expected %d)",
+            weight.shape[0], ndata);
+        goto exit;
+    }
+    if (celldata.nz != ndata) {
+        PyErr_Format(PyExc_RuntimeError,
+            "the celldata array size is not consistent with the data "
+            "(last dimension is %d; expected %d)", celldata.nz, ndata);
+        goto exit;
+    }
+    somcluster(nrows,
+               ncols,
+               data.values,
+               mask.values,
+               weight.buf,
+               transpose,
+               celldata.nx,
+               celldata.ny,
+               inittau,
+               niter,
+               dist,
+               celldata.values,
+               indices.buf);
+    Py_INCREF(Py_None);
+    result = Py_None;
+
+exit:
+    data_converter(NULL, &data);
+    vector_converter(NULL, &weight);
+    index2d_converter(NULL, &indices);
+    celldata_converter(NULL, &celldata);
+    return result;
+}
+/* end of wrapper for somcluster */
+
+/* clusterdistance */
+static char clusterdistance__doc__[] =
+"clusterdistance(data, mask, weight, index1, index2, dist, method,\n"
+"                transpose) -> distance between two clusters\n"
+"\n"
+"Arguments:\n"
+"\n"
+" - data: nrows x ncols array containing the data values.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+"   missing. 
If mask[i,j] == 0, then data[i,j] is missing.\n" +"\n" +" - weight: the weights to be used when calculating distances\n" +"\n" +" - index1: 1D array identifying which items belong to the first\n" +" cluster.\n" +"\n" +" - index2: 1D array identifying which items belong to the second\n" +" cluster.\n" +"\n" +" - dist: specifies the distance function to be used:\n" +"\n" +" - dist == 'e': Euclidean distance\n" +" - dist == 'b': City Block distance\n" +" - dist == 'c': Pearson correlation\n" +" - dist == 'a': absolute value of the correlation\n" +" - dist == 'u': uncentered correlation\n" +" - dist == 'x': absolute uncentered correlation\n" +" - dist == 's': Spearman's rank correlation\n" +" - dist == 'k': Kendall's tau\n" +"\n" +" - method: specifies how the distance between two clusters is defined:\n" +"\n" +" - method == 'a': the distance between the arithmetic means of the\n" +" two clusters\n" +" - method == 'm': the distance between the medians of the two\n" +" clusters\n" +" - method == 's': the smallest pairwise distance between members\n" +" of the two clusters\n" +" - method == 'x': the largest pairwise distance between members of\n" +" the two clusters\n" +" - method == 'v': average of the pairwise distances between\n" +" members of the clusters\n" +"\n" +" - transpose:\n" +"\n" +" - if equal to 0: clusters of rows are considered;\n" +" - if equal to 1: clusters of columns are considered.\n" +"\n"; + +static PyObject* +py_clusterdistance(PyObject* self, PyObject* args, PyObject* keywords) +{ + double distance; + int nrows; + int ncols; + int ndata; + Data data = {0}; + Mask mask = {0}; + Py_buffer weight = {0}; + char dist = 'e'; + char method = 'a'; + int transpose = 0; + Py_buffer index1 = {0}; + Py_buffer index2 = {0}; + PyObject* result = NULL; + + static char* kwlist[] = {"data", + "mask", + "weight", + "index1", + "index2", + "method", + "dist", + "transpose", + NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&O&O&i", kwlist, + data_converter, &data, + mask_converter, &mask, + vector_converter, &weight, + index_converter, &index1, + index_converter, &index2, + method_clusterdistance_converter, &method, + distance_converter, &dist, + &transpose)) return NULL; + if (!data.values) { + PyErr_SetString(PyExc_RuntimeError, "data is None"); + goto exit; + } + if (!mask.values) { + PyErr_SetString(PyExc_RuntimeError, "mask is None"); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + ndata = transpose ? 
nrows : ncols; + if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", + mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); + goto exit; + } + if (weight.shape[0] != ndata) { + PyErr_Format(PyExc_RuntimeError, + "weight has incorrect size %zd (expected %d)", + weight.shape[0], ndata); + goto exit; + } + + distance = clusterdistance(nrows, + ncols, + data.values, + mask.values, + weight.buf, + (int) index1.shape[0], + (int) index2.shape[0], + index1.buf, + index2.buf, + dist, + method, + transpose); + + if (distance < -0.5) /* Actually -1.0; avoiding roundoff errors */ + PyErr_SetString(PyExc_IndexError, "index out of range"); + else + result = PyFloat_FromDouble(distance); +exit: + data_converter(NULL, &data); + mask_converter(NULL, &mask); + vector_converter(NULL, &weight); + index_converter(NULL, &index1); + index_converter(NULL, &index2); + return result; +} +/* end of wrapper for clusterdistance */ + +/* clustercentroids */ +static char clustercentroids__doc__[] = +"clustercentroids(data, mask, clusterid, method, transpose) -> cdata, cmask\n" +"\n" +"The clustercentroids routine calculates the cluster centroids, given to\n" +"which cluster each element belongs. The centroid is defined as either\n" +"the mean or the median over all elements for each dimension.\n" +"\n" +"Arguments:\n" +" - data: nrows x ncols array containing the data values.\n" +"\n" +" - mask: nrows x ncols array of integers, showing which data are\n" +" missing. If mask[i,j] == 0, then data[i,j] is missing.\n" +"\n" +" - clusterid: array containing the cluster number for each item.\n" +" The cluster number should be non-negative.\n" +"\n" +" - method: specifies whether the centroid is calculated from the\n" +" arithmetic mean (method == 'a', default) or the median\n" +" (method == 'm') over each dimension.\n" +"\n" +" - transpose: if equal to 0, row clusters are considered;\n" +" if equal to 1, column clusters are considered.\n" +"\n" +" - cdata: 2D array containing, upon return, the cluster centroids.\n" +" If transpose == 0, then the dimensions of cdata should be\n" +" nclusters x ncols.\n" +" If transpose == 1, then the dimensions of cdata should be \n" +" nrows x nclusters.\n" +"\n" +" - cmask: 2D array of integers describing, upon return, which elements\n" +" in cdata, if any, are missing.\n"; + +static PyObject* +py_clustercentroids(PyObject* self, PyObject* args, PyObject* keywords) +{ + int nrows; + int ncols; + int nclusters; + Data data = {0}; + Mask mask = {0}; + Data cdata = {0}; + Mask cmask = {0}; + Py_buffer clusterid = {0}; + char method = 'a'; + int transpose = 0; + int ok = -1; + + static char* kwlist[] = {"data", + "mask", + "clusterid", + "method", + "transpose", + "cdata", + "cmask", + NULL }; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&iO&O&", kwlist, + data_converter, &data, + mask_converter, &mask, + index_converter, &clusterid, + method_kcluster_converter, &method, + &transpose, + data_converter, &cdata, + mask_converter, &cmask)) return NULL; + if (!data.values) { + PyErr_SetString(PyExc_RuntimeError, "data is None"); + goto exit; + } + if (!mask.values) { + PyErr_SetString(PyExc_RuntimeError, "mask is None"); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", + 
mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); + goto exit; + } + if (transpose == 0) { + nclusters = check_clusterid(clusterid, nrows); + nrows = nclusters; + } + else { + nclusters = check_clusterid(clusterid, ncols); + ncols = nclusters; + } + if (nclusters == 0) goto exit; + if (cdata.nrows != nrows) { + PyErr_Format(PyExc_RuntimeError, + "cdata has incorrect number of rows (%d, expected %d)", + cdata.nrows, nrows); + goto exit; + } + if (cdata.ncols != ncols) { + PyErr_Format(PyExc_RuntimeError, + "cdata has incorrect number of columns (%d, expected %d)", + cdata.ncols, ncols); + goto exit; + } + if (cmask.view.shape[0] != nrows) { + PyErr_Format(PyExc_RuntimeError, + "cmask has incorrect number of rows (%zd, expected %d)", + cmask.view.shape[0], nrows); + goto exit; + } + if (cmask.view.shape[1] != ncols) { + PyErr_Format(PyExc_RuntimeError, + "cmask has incorrect number of columns " + "(%zd, expected %d)", cmask.view.shape[1], ncols); + goto exit; + } + ok = getclustercentroids(nclusters, + data.nrows, + data.ncols, + data.values, + mask.values, + clusterid.buf, + cdata.values, + cmask.values, + transpose, + method); +exit: + data_converter(NULL, &data); + mask_converter(NULL, &mask); + data_converter(NULL, &cdata); + mask_converter(NULL, &cmask); + index_converter(NULL, &clusterid); + if (ok == -1) return NULL; + if (ok == 0) return PyErr_NoMemory(); + Py_INCREF(Py_None); + return Py_None; +} +/* end of wrapper for clustercentroids */ + +/* distancematrix */ +static char distancematrix__doc__[] = +"distancematrix(data, mask, weight, transpose, dist, distancematrix)\n" +" -> None\n" +"\n" +"This function calculates the distance matrix between the data values.\n" +"\n" +"Arguments:\n" +"\n" +" - data: nrows x ncols array containing the data values.\n" +"\n" +" - mask: nrows x ncols array of integers, showing which data are\n" +" missing. If mask[i,j] == 0, then data[i,j] is missing.\n" +"\n" +" - weight: the weights to be used when calculating distances.\n" +"\n" +" - transpose: if equal to 0, the distances between rows are\n" +" calculated;\n" +" if equal to 1, the distances between columns are calculated.\n" +"\n" +" - dist: specifies the distance function to be used:\n" +"\n" +" - dist == 'e': Euclidean distance\n" +" - dist == 'b': City Block distance\n" +" - dist == 'c': Pearson correlation\n" +" - dist == 'a': absolute value of the correlation\n" +" - dist == 'u': uncentered correlation\n" +" - dist == 'x': absolute uncentered correlation\n" +" - dist == 's': Spearman's rank correlation\n" +" - dist == 'k': Kendall's tau\n" +"\n" +" - distancematrix: Upon return, the distance matrix as a list of 1D\n" +" arrays. 
The number of columns in each row is equal to the row number\n" +" (i.e., len(distancematrix[i]) == i).\n" +" An example of the return value is:\n" +"\n" +" matrix = [[],\n" +" array([1.]),\n" +" array([7., 3.]),\n" +" array([4., 2., 6.])]\n" +"\n" +"This corresponds to the distance matrix:\n" +"\n" +" [0.\t1.\t7.\t4.]\n" +" [1.\t0.\t3.\t2.]\n" +" [7.\t3.\t0.\t6.]\n" +" [4.\t2.\t6.\t0.]\n"; + +static PyObject* +py_distancematrix(PyObject* self, PyObject* args, PyObject* keywords) +{ + PyObject* list; + Distancematrix distances = {0}; + Data data = {0}; + Mask mask = {0}; + Py_buffer weight = {0}; + int transpose = 0; + char dist = 'e'; + int nrows, ncols, ndata; + PyObject* result = NULL; + + /* -- Read the input variables --------------------------------------- */ + static char* kwlist[] = {"data", + "mask", + "weight", + "transpose", + "dist", + "distancematrix", + NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&iO&O!", kwlist, + data_converter, &data, + mask_converter, &mask, + vector_converter, &weight, + &transpose, + distance_converter, &dist, + &PyList_Type, &list)) return NULL; + if (!data.values) { + PyErr_SetString(PyExc_RuntimeError, "data is None"); + goto exit; + } + if (!mask.values) { + PyErr_SetString(PyExc_RuntimeError, "mask is None"); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { + PyErr_Format(PyExc_ValueError, + "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", + mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); + goto exit; + } + ndata = (transpose == 0) ? ncols : nrows; + if (weight.shape[0] != ndata) { + PyErr_Format(PyExc_ValueError, + "weight has incorrect size %zd (expected %d)", + weight.shape[0], ndata); + goto exit; + } + if (_convert_list_to_distancematrix(list, &distances) == 0) goto exit; + + distancematrix(nrows, + ncols, + data.values, + mask.values, + weight.buf, + dist, + transpose, + distances.values); + + Py_INCREF(Py_None); + result = Py_None; +exit: + data_converter(NULL, &data); + mask_converter(NULL, &mask); + vector_converter(NULL, &weight); + distancematrix_converter(NULL, &distances); + return result; +} +/* end of wrapper for distancematrix */ + +/* pca */ +static char pca__doc__[] = +"pca(data, columnmean, coordinates, pc, eigenvalues) -> None\n" +"\n" +"This function calculates the principal component decomposition\n" +"of the values in data.\n" +"\n" +"Arguments:\n" +"\n" +" - data: nrows x ncols array containing the data values.\n" +"\n" +" - columnmean: array of size ncols in which the mean of each column\n" +" will be stored.\n" +"\n" +" - coordinates: nrows x nmin array in which the coordinates of the\n" +" data along the principal components will be stored;\n" +" nmin is min(nrows, ncols).\n" +"\n" +" - pc: the principal components as an nmin x ncols array, where nmin\n" +" is min(nrows, ncols).\n" +"\n" +" - eigenvalues: array of size min(nrows, ncols), in which the\n" +" eigenvalues will be stored, sorted by the magnitude\n" +" of the eigenvalues, with the largest eigenvalues\n" +" appearing first.\n" +"\n" +"Adding the column means to the dot product of the coordinates and the\n" +"principal components, i.e.\n" +"\n" +" columnmean + dot(coordinates, pc)\n" +"\n" +"recreates the data matrix.\n"; + +static PyObject* +py_pca(PyObject* self, PyObject* args) +{ + Py_buffer eigenvalues = {0}; + double** u; + double** v; + Data data = {0}; + Data pc = {0}; + Data coordinates = {0}; + Py_buffer mean = {0}; + 
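+ /* u must point to an nrows x ncols buffer: the mean-centered data are + written into u[i][j] below before pca() is called. Depending on the + shape of the input, that buffer is coordinates (when nrows >= ncols) + or pc (when nrows < ncols), with v pointing to the other array. */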
int nrows, ncols; + int nmin; + int error = -2; + double* p; + double** values; + int i, j; + + if (!PyArg_ParseTuple(args, "O&O&O&O&O&", + data_converter, &data, + vector_converter, &mean, + data_converter, &coordinates, + data_converter, &pc, + vector_converter, &eigenvalues)) return NULL; + + values = data.values; + if (!values) { + PyErr_SetString(PyExc_RuntimeError, "data is None"); + goto exit; + } + nrows = data.nrows; + ncols = data.ncols; + if (mean.shape[0] != ncols) { + PyErr_Format(PyExc_RuntimeError, + "columnmean has inconsistent size %zd (expected %d)", + mean.shape[0], ncols); + goto exit; + } + nmin = nrows < ncols ? nrows : ncols; + if (pc.nrows != nmin || pc.ncols != ncols) { + PyErr_Format(PyExc_RuntimeError, + "pc has inconsistent size %d x %d (expected %d x %d)", + pc.nrows, pc.ncols, nmin, ncols); + goto exit; + } + if (coordinates.nrows != nrows || coordinates.ncols != nmin) { + PyErr_Format(PyExc_RuntimeError, + "coordinates has inconsistent size %d x %d (expected %d x %d)", + coordinates.nrows, coordinates.ncols, nrows, nmin); + goto exit; + } + if (nrows >= ncols) { + u = coordinates.values; + v = pc.values; + } + else { /* nrows < ncols */ + u = pc.values; + v = coordinates.values; + } + /* -- Calculate the mean of each column ------------------------------ */ + p = mean.buf; + for (j = 0; j < ncols; j++) { + p[j] = 0.0; + for (i = 0; i < nrows; i++) p[j] += values[i][j]; + p[j] /= nrows; + } + /* -- Subtract the mean of each column ----------------------------- */ + for (i = 0; i < nrows; i++) + for (j = 0; j < ncols; j++) + u[i][j] = values[i][j] - p[j]; + /* -- Perform the principal component analysis ----------------------- */ + error = pca(nrows, ncols, u, v, eigenvalues.buf); + /* ------------------------------------------------------------------- */ +exit: + data_converter(NULL, &data); + vector_converter(NULL, &mean); + data_converter(NULL, &pc); + data_converter(NULL, &coordinates); + vector_converter(NULL, &eigenvalues); + if (error == 0) { + Py_INCREF(Py_None); + return Py_None; + } + if (error == -1) return PyErr_NoMemory(); + else if (error > 0) + PyErr_SetString(PyExc_RuntimeError, + "Singular value decomposition failed to converge"); + return NULL; +} +/* end of wrapper for pca */ + +/* ========================================================================= */ +/* -- The methods table ---------------------------------------------------- */ +/* ========================================================================= */ + + +static struct PyMethodDef cluster_methods[] = { + {"version", (PyCFunction) py_version, METH_NOARGS, version__doc__}, + {"kcluster", + (PyCFunction) py_kcluster, + METH_VARARGS | METH_KEYWORDS, + kcluster__doc__ + }, + {"kmedoids", + (PyCFunction) py_kmedoids, + METH_VARARGS | METH_KEYWORDS, + kmedoids__doc__ + }, + {"treecluster", + (PyCFunction) py_treecluster, + METH_VARARGS | METH_KEYWORDS, + treecluster__doc__ + }, + {"somcluster", + (PyCFunction) py_somcluster, + METH_VARARGS | METH_KEYWORDS, + somcluster__doc__ + }, + {"clusterdistance", + (PyCFunction) py_clusterdistance, + METH_VARARGS | METH_KEYWORDS, + clusterdistance__doc__ + }, + {"clustercentroids", + (PyCFunction) py_clustercentroids, + METH_VARARGS | METH_KEYWORDS, + clustercentroids__doc__ + }, + {"distancematrix", + (PyCFunction) py_distancematrix, + METH_VARARGS | METH_KEYWORDS, + distancematrix__doc__ + }, + {"pca", + (PyCFunction) py_pca, + METH_VARARGS, /* py_pca takes no keyword arguments */ + pca__doc__ + }, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +/* 
========================================================================= */ +/* -- Initialization ------------------------------------------------------- */ +/* ========================================================================= */ + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "_cluster", + "C Clustering Library", + -1, + cluster_methods, + NULL, + NULL, + NULL, + NULL +}; + +PyMODINIT_FUNC +PyInit__cluster(void) +{ + PyObject *module; + + PyNodeType.tp_new = PyType_GenericNew; + if (PyType_Ready(&PyNodeType) < 0) + return NULL; + if (PyType_Ready(&PyTreeType) < 0) + return NULL; + + module = PyModule_Create(&moduledef); + if (module == NULL) return NULL; + + Py_INCREF(&PyTreeType); + if (PyModule_AddObject(module, "Tree", (PyObject*) &PyTreeType) < 0) { + Py_DECREF(module); + Py_DECREF(&PyTreeType); + return NULL; + } + + Py_INCREF(&PyNodeType); + if (PyModule_AddObject(module, "Node", (PyObject*) &PyNodeType) < 0) { + Py_DECREF(module); + Py_DECREF(&PyNodeType); + return NULL; + } + + return module; +} diff --git a/code/lib/Bio/Compass/__init__.py b/code/lib/Bio/Compass/__init__.py new file mode 100644 index 0000000..3d5e37a --- /dev/null +++ b/code/lib/Bio/Compass/__init__.py @@ -0,0 +1,223 @@ +# Copyright 2004 by James Casbon. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Code to deal with output from COMPASS, a program for profile/profile comparison. + +Compass is described in: + +Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein +alignments with assessment of statistical significance. J Mol Biol. 2003 Feb +7;326(1):317-36. + +Tested with COMPASS 1.24. 
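+ +A minimal usage sketch (the file name below is hypothetical, for +illustration only): + + from Bio import Compass + + with open("my_compass_output.txt") as handle: + for record in Compass.parse(handle): + print(record.query, record.hit, record.evalue)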
+""" + +import re + + +def read(handle): + """Read a COMPASS file containing one COMPASS record.""" + record = None + try: + line = next(handle) + record = Record() + __read_names(record, line) + line = next(handle) + __read_threshold(record, line) + line = next(handle) + __read_lengths(record, line) + line = next(handle) + __read_profilewidth(record, line) + line = next(handle) + __read_scores(record, line) + except StopIteration: + if not record: + raise ValueError("No record found in handle") from None + else: + raise ValueError("Unexpected end of stream.") from None + for line in handle: + if not line.strip(): # skip empty lines + continue + __read_query_alignment(record, line) + try: + line = next(handle) + __read_positive_alignment(record, line) + line = next(handle) + __read_hit_alignment(record, line) + except StopIteration: + raise ValueError("Unexpected end of stream.") from None + return record + + +def parse(handle): + """Iterate over records in a COMPASS file.""" + record = None + try: + line = next(handle) + except StopIteration: + return + while True: + try: + record = Record() + __read_names(record, line) + line = next(handle) + __read_threshold(record, line) + line = next(handle) + __read_lengths(record, line) + line = next(handle) + __read_profilewidth(record, line) + line = next(handle) + __read_scores(record, line) + except StopIteration: + raise ValueError("Unexpected end of stream.") from None + for line in handle: + if not line.strip(): + continue + if "Ali1:" in line: + yield record + break + __read_query_alignment(record, line) + try: + line = next(handle) + __read_positive_alignment(record, line) + line = next(handle) + __read_hit_alignment(record, line) + except StopIteration: + raise ValueError("Unexpected end of stream.") from None + else: + yield record + break + + +class Record: + """Hold information from one compass hit. + + Ali1 is the query, Ali2 the hit. 
+ """ + + def __init__(self): + """Initialize the class.""" + self.query = "" + self.hit = "" + self.gap_threshold = 0 + self.query_length = 0 + self.query_filtered_length = 0 + self.query_nseqs = 0 + self.query_neffseqs = 0 + self.hit_length = 0 + self.hit_filtered_length = 0 + self.hit_nseqs = 0 + self.hit_neffseqs = 0 + self.sw_score = 0 + self.evalue = -1 + self.query_start = -1 + self.hit_start = -1 + self.query_aln = "" + self.hit_aln = "" + self.positives = "" + + def query_coverage(self): + """Return the length of the query covered in the alignment.""" + s = self.query_aln.replace("=", "") + return len(s) + + def hit_coverage(self): + """Return the length of the hit covered in the alignment.""" + s = self.hit_aln.replace("=", "") + return len(s) + + +# Everything below is private + +__regex = { + "names": re.compile(r"Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"), + "threshold": re.compile(r"Threshold of effective gap content in columns: (\S+)"), + "lengths": re.compile( + r"length1=(\S+)\s+filtered_length1=(\S+)" + r"\s+length2=(\S+)\s+filtered_length2=(\S+)" + ), + "profilewidth": re.compile( + r"Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)" + ), + "scores": re.compile(r"Smith-Waterman score = (\S+)\s+Evalue = (\S+)"), + "start": re.compile(r"(\d+)"), + "align": re.compile(r"^.{15}(\S+)"), + "positive_alignment": re.compile(r"^.{15}(.+)"), +} + + +def __read_names(record, line): + # Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln + # ------query----- -------hit------------- + if "Ali1:" not in line: + raise ValueError("Line does not contain 'Ali1:':\n%s" % line) + m = __regex["names"].search(line) + record.query = m.group(1) + record.hit = m.group(2) + + +def __read_threshold(record, line): + if not line.startswith("Threshold"): + raise ValueError("Line does not start with 'Threshold':\n%s" % line) + m = __regex["threshold"].search(line) + record.gap_threshold = float(m.group(1)) + + +def __read_lengths(record, line): + if not line.startswith("length1="): + raise ValueError("Line does not start with 'length1=':\n%s" % line) + m = __regex["lengths"].search(line) + record.query_length = int(m.group(1)) + record.query_filtered_length = float(m.group(2)) + record.hit_length = int(m.group(3)) + record.hit_filtered_length = float(m.group(4)) + + +def __read_profilewidth(record, line): + if "Nseqs1" not in line: + raise ValueError("Line does not contain 'Nseqs1':\n%s" % line) + m = __regex["profilewidth"].search(line) + record.query_nseqs = int(m.group(1)) + record.query_neffseqs = float(m.group(2)) + record.hit_nseqs = int(m.group(3)) + record.hit_neffseqs = float(m.group(4)) + + +def __read_scores(record, line): + if not line.startswith("Smith-Waterman"): + raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line) + m = __regex["scores"].search(line) + if m: + record.sw_score = int(m.group(1)) + record.evalue = float(m.group(2)) + else: + record.sw_score = 0 + record.evalue = -1.0 + + +def __read_query_alignment(record, line): + m = __regex["start"].search(line) + if m: + record.query_start = int(m.group(1)) + m = __regex["align"].match(line) + assert m is not None, "invalid match" + record.query_aln += m.group(1) + + +def __read_positive_alignment(record, line): + m = __regex["positive_alignment"].match(line) + assert m is not None, "invalid match" + record.positives += m.group(1) + + +def __read_hit_alignment(record, line): + m = __regex["start"].search(line) + if m: + record.hit_start = int(m.group(1)) + m = __regex["align"].match(line) + assert m is not 
None, "invalid match" + record.hit_aln += m.group(1) diff --git a/code/lib/Bio/Compass/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Compass/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..fea698f Binary files /dev/null and b/code/lib/Bio/Compass/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Data/CodonTable.py b/code/lib/Bio/Data/CodonTable.py new file mode 100644 index 0000000..bc006ee --- /dev/null +++ b/code/lib/Bio/Data/CodonTable.py @@ -0,0 +1,1313 @@ +# Copyright 2000 Andrew Dalke. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Codon tables based on those from the NCBI. + +These tables are based on parsing the NCBI file +ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt +using Scripts/update_ncbi_codon_table.py + +Last updated at Version 4.4 (May 2019) +""" + +from Bio.Data import IUPACData + + +unambiguous_dna_by_name = {} +unambiguous_dna_by_id = {} +unambiguous_rna_by_name = {} +unambiguous_rna_by_id = {} +generic_by_name = {} # unambiguous DNA or RNA +generic_by_id = {} # unambiguous DNA or RNA + +ambiguous_dna_by_name = {} +ambiguous_dna_by_id = {} +ambiguous_rna_by_name = {} +ambiguous_rna_by_id = {} +ambiguous_generic_by_name = {} # ambiguous DNA or RNA +ambiguous_generic_by_id = {} # ambiguous DNA or RNA + +# standard IUPAC unambiguous codons +standard_dna_table = None +standard_rna_table = None + + +# In the future, the back_table could return a statistically +# appropriate distribution of codons, so do not cache the results of +# back_table lookups! + + +class TranslationError(Exception): + """Container for translation specific exceptions.""" + + pass + + +class CodonTable: + """A codon-table, or genetic code.""" + + forward_table = {} # only includes codons which actually code + back_table = {} # for back translations + start_codons = [] + stop_codons = [] + + # Not always called from derived classes! + def __init__( + self, + nucleotide_alphabet=None, + protein_alphabet=None, + forward_table=forward_table, + back_table=back_table, + start_codons=start_codons, + stop_codons=stop_codons, + ): + """Initialize the class.""" + self.nucleotide_alphabet = nucleotide_alphabet + self.protein_alphabet = protein_alphabet + self.forward_table = forward_table + self.back_table = back_table + self.start_codons = start_codons + self.stop_codons = stop_codons + + def __str__(self): + """Return a simple text representation of the codon table. + + e.g.:: + + >>> import Bio.Data.CodonTable + >>> print(Bio.Data.CodonTable.standard_dna_table) + Table 1 Standard, SGC0 + + | T | C | A | G | + --+---------+---------+---------+---------+-- + T | TTT F | TCT S | TAT Y | TGT C | T + T | TTC F | TCC S | TAC Y | TGC C | C + ... + G | GTA V | GCA A | GAA E | GGA G | A + G | GTG V | GCG A | GAG E | GGG G | G + --+---------+---------+---------+---------+-- + >>> print(Bio.Data.CodonTable.generic_by_id[1]) + Table 1 Standard, SGC0 + + | U | C | A | G | + --+---------+---------+---------+---------+-- + U | UUU F | UCU S | UAU Y | UGU C | U + U | UUC F | UCC S | UAC Y | UGC C | C + ... 
+ G | GUA V | GCA A | GAA E | GGA G | A + G | GUG V | GCG A | GAG E | GGG G | G + --+---------+---------+---------+---------+-- + """ + if self.id: + answer = "Table %i" % self.id + else: + answer = "Table ID unknown" + if self.names: + answer += " " + ", ".join([x for x in self.names if x]) + + # Use the main four letters (and the conventional ordering) + # even for ambiguous tables + letters = self.nucleotide_alphabet + if letters is not None and "T" in letters: + letters = "TCAG" + else: + # Should be either RNA or generic nucleotides, + # e.g. Bio.Data.CodonTable.generic_by_id[1] + letters = "UCAG" + + # Build the table... + answer += "\n\n" + answer += " |" + "|".join(" %s " % c2 for c2 in letters) + "|" + answer += "\n--+" + "+".join("---------" for c2 in letters) + "+--" + for c1 in letters: + for c3 in letters: + line = c1 + " |" + for c2 in letters: + codon = c1 + c2 + c3 + line += " %s" % codon + if codon in self.stop_codons: + line += " Stop|" + else: + try: + amino = self.forward_table[codon] + except KeyError: + amino = "?" + except TranslationError: + amino = "?" + if codon in self.start_codons: + line += " %s(s)|" % amino + else: + line += " %s |" % amino + line += " " + c3 + answer += "\n" + line + answer += "\n--+" + "+".join("---------" for c2 in letters) + "+--" + return answer + + +def make_back_table(table, default_stop_codon): + """Build a back-table (naive single codon mapping). + + ONLY RETURNS A SINGLE CODON, chosen from the possible alternatives + based on their sort order. + """ + # Do the sort so changes in the hash implementation won't affect + # the result when one amino acid is coded by more than one codon. + back_table = {} + for key in sorted(table): + back_table[table[key]] = key + back_table[None] = default_stop_codon + return back_table + + +class NCBICodonTable(CodonTable): + """Codon table for generic nucleotide sequences.""" + + nucleotide_alphabet = None + protein_alphabet = IUPACData.protein_letters + + def __init__(self, id, names, table, start_codons, stop_codons): + """Initialize the class.""" + self.id = id + self.names = names + self.forward_table = table + self.back_table = make_back_table(table, stop_codons[0]) + self.start_codons = start_codons + self.stop_codons = stop_codons + + def __repr__(self): + """Represent the NCBI codon table class as a string for debugging.""" + return "%s(id=%r, names=%r, ...)" % ( + self.__class__.__name__, + self.id, + self.names, + ) + + +class NCBICodonTableDNA(NCBICodonTable): + """Codon table for unambiguous DNA sequences.""" + + nucleotide_alphabet = IUPACData.unambiguous_dna_letters + + +class NCBICodonTableRNA(NCBICodonTable): + """Codon table for unambiguous RNA sequences.""" + + nucleotide_alphabet = IUPACData.unambiguous_rna_letters + + +# ######## Deal with ambiguous forward translations + + +class AmbiguousCodonTable(CodonTable): + """Base codon table for ambiguous sequences.""" + + def __init__( + self, + codon_table, + ambiguous_nucleotide_alphabet, + ambiguous_nucleotide_values, + ambiguous_protein_alphabet, + ambiguous_protein_values, + ): + """Initialize the class.""" + CodonTable.__init__( + self, + ambiguous_nucleotide_alphabet, + ambiguous_protein_alphabet, + AmbiguousForwardTable( + codon_table.forward_table, + ambiguous_nucleotide_values, + ambiguous_protein_values, + ), + codon_table.back_table, + # These two are WRONG! 
I need to get the + # list of ambiguous codons which code for + # the stop codons XXX + list_ambiguous_codons( + codon_table.start_codons, ambiguous_nucleotide_values + ), + list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values), + ) + self._codon_table = codon_table + + # Be sneaky and forward attribute lookups to the original table. + # This lets us get the names, if the original table is an NCBI + # table. + def __getattr__(self, name): + """Forward attribute lookups to the original table.""" + return getattr(self._codon_table, name) + + +def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values): + """Return all possible encoded amino acids for ambiguous codon.""" + c1, c2, c3 = codon + x1 = ambiguous_nucleotide_values[c1] + x2 = ambiguous_nucleotide_values[c2] + x3 = ambiguous_nucleotide_values[c3] + possible = {} + stops = [] + for y1 in x1: + for y2 in x2: + for y3 in x3: + try: + possible[forward_table[y1 + y2 + y3]] = 1 + except KeyError: + # If tripping over a stop codon + stops.append(y1 + y2 + y3) + if stops: + if possible: + raise TranslationError( + "ambiguous codon %r codes for both proteins and stop codons" % codon + ) + # This is a true stop codon - tell the caller about it + raise KeyError(codon) + return list(possible) + + +def list_ambiguous_codons(codons, ambiguous_nucleotide_values): + """Extend a codon list to include all possible ambiguous codons. + + e.g.:: + + ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR'] + ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA'] + + Note that ['TAG', 'TGA'] -> ['TAG', 'TGA']; this does not add 'TRR' + (which could also mean 'TAA' or 'TGG'). + Thus only two more codons are added in the following: + + e.g.:: + + ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TRA', 'TAR'] + + Returns a new (longer) list of codon strings. + """ + # Note ambiguous_nucleotide_values['R'] = 'AG' (etc) + # This will generate things like 'TRR' from ['TAG', 'TGA'], which + # we don't want to include: + c1_list = sorted( + letter + for letter, meanings in ambiguous_nucleotide_values.items() + if {codon[0] for codon in codons}.issuperset(set(meanings)) + ) + c2_list = sorted( + letter + for letter, meanings in ambiguous_nucleotide_values.items() + if {codon[1] for codon in codons}.issuperset(set(meanings)) + ) + c3_list = sorted( + letter + for letter, meanings in ambiguous_nucleotide_values.items() + if {codon[2] for codon in codons}.issuperset(set(meanings)) + ) + # candidates is a list (not a set) to preserve the iteration order + candidates = [] + for c1 in c1_list: + for c2 in c2_list: + for c3 in c3_list: + codon = c1 + c2 + c3 + if codon not in candidates and codon not in codons: + candidates.append(codon) + answer = codons[:] # copy + # print("Have %i new candidates" % len(candidates)) + for ambig_codon in candidates: + wanted = True + # e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG' + for codon in [ + c1 + c2 + c3 + for c1 in ambiguous_nucleotide_values[ambig_codon[0]] + for c2 in ambiguous_nucleotide_values[ambig_codon[1]] + for c3 in ambiguous_nucleotide_values[ambig_codon[2]] + ]: + if codon not in codons: + # This ambiguous codon can code for a non-stop, exclude it! 
+ wanted = False + # print("Rejecting %s" % ambig_codon) + continue + if wanted: + answer.append(ambig_codon) + return answer + + +assert list_ambiguous_codons(["TGA", "TAA"], IUPACData.ambiguous_dna_values) == [ + "TGA", + "TAA", + "TRA", +] +assert list_ambiguous_codons(["TAG", "TGA"], IUPACData.ambiguous_dna_values) == [ + "TAG", + "TGA", +] +assert list_ambiguous_codons(["TAG", "TAA"], IUPACData.ambiguous_dna_values) == [ + "TAG", + "TAA", + "TAR", +] +assert list_ambiguous_codons(["UAG", "UAA"], IUPACData.ambiguous_rna_values) == [ + "UAG", + "UAA", + "UAR", +] +assert list_ambiguous_codons(["TGA", "TAA", "TAG"], IUPACData.ambiguous_dna_values) == [ + "TGA", + "TAA", + "TAG", + "TAR", + "TRA", +] + +# Forward translation is "onto", that is, any given codon always maps +# to the same protein, or it doesn't map at all. Thus, I can build +# off of an existing table to produce the ambiguous mappings. +# +# This handles the general case. Perhaps it's overkill? +# >>> t = CodonTable.ambiguous_dna_by_id[1] +# >>> t.forward_table["AAT"] +# 'N' +# >>> t.forward_table["GAT"] +# 'D' +# >>> t.forward_table["RAT"] +# 'B' +# >>> t.forward_table["YTA"] +# 'L' + + +class AmbiguousForwardTable: + """Forward table for translation of ambiguous nucleotide sequences.""" + + def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein): + """Initialize the class.""" + self.forward_table = forward_table + + self.ambiguous_nucleotide = ambiguous_nucleotide + self.ambiguous_protein = ambiguous_protein + + inverted = {} + for name, val in ambiguous_protein.items(): + for c in val: + x = inverted.get(c, {}) + x[name] = 1 + inverted[c] = x + for name, val in inverted.items(): + inverted[name] = list(val) + self._inverted = inverted + + self._cache = {} + + def __contains__(self, codon): + """Check if codon works as key for ambiguous forward_table. + + Only returns 'True' if forward_table[codon] returns a value. + """ + try: + self.__getitem__(codon) + return True + except (KeyError, TranslationError): + return False + + def get(self, codon, failobj=None): + """Implement get for dictionary-like behaviour.""" + try: + return self.__getitem__(codon) + except KeyError: + return failobj + + def __getitem__(self, codon): + """Implement dictionary-like behaviour for AmbiguousForwardTable. + + forward_table[codon] will either return an amino acid letter, + raise a KeyError (if the codon does not encode an amino acid), + or raise a TranslationError (if the codon does encode an amino + acid but is also a stop codon, or encodes several amino acids + for which no unique letter is available in the given alphabet). + """ + try: + x = self._cache[codon] + except KeyError: + pass + else: + if x is TranslationError: + raise TranslationError(codon) # no unique translation + if x is KeyError: + raise KeyError(codon) # it's a stop codon + return x + try: + x = self.forward_table[codon] + self._cache[codon] = x + return x + except KeyError: + pass + + # XXX Need to make part of this into a method which returns + # a list of all possible encodings for a codon! + try: + possible = list_possible_proteins( + codon, self.forward_table, self.ambiguous_nucleotide + ) + except KeyError: + self._cache[codon] = KeyError + raise KeyError(codon) from None # stop codon + except TranslationError: + self._cache[codon] = TranslationError + raise TranslationError(codon) # does not code + assert len(possible) > 0, "unambiguous codons must code" + + # Hah! 
Only one possible protein, so use it + if len(possible) == 1: + self._cache[codon] = possible[0] + return possible[0] + + # See if there's an ambiguous protein encoding for the multiples. + # Find residues which exist in every coding set. + ambiguous_possible = {} + for amino in possible: + for term in self._inverted[amino]: + ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1 + + n = len(possible) + possible = [] + for amino, val in ambiguous_possible.items(): + if val == n: + possible.append(amino) + + # No amino acid encoding for the results + if len(possible) == 0: + self._cache[codon] = TranslationError + raise TranslationError(codon) # no valid translation + + # All of these are valid, so choose one + # To be unique, sort by smallest ambiguity then alphabetically + # Can get this if "X" encodes for everything. + # def _sort(x, y, table = self.ambiguous_protein): + # a = cmp(len(table[x]), len(table[y])) + # if a == 0: + # return cmp(x, y) + # return a + + # Sort by key is 2.x and 3.x compatible + possible.sort(key=lambda x: (len(self.ambiguous_protein[x]), x)) + + x = possible[0] + self._cache[codon] = x + return x + + +def register_ncbi_table(name, alt_name, id, table, start_codons, stop_codons): + """Turn codon table data into objects (PRIVATE). + + The data is stored in the dictionaries. + """ + # In most cases names are divided by "; ", however there is also + # Table 11 'Bacterial, Archaeal and Plant Plastid Code', previously + # 'Bacterial and Plant Plastid' which used to be just 'Bacterial' + names = [ + x.strip() for x in name.replace(" and ", "; ").replace(", ", "; ").split("; ") + ] + + dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons, stop_codons) + ambig_dna = AmbiguousCodonTable( + dna, + IUPACData.ambiguous_dna_letters, + IUPACData.ambiguous_dna_values, + IUPACData.extended_protein_letters, + IUPACData.extended_protein_values, + ) + + # replace all T's with U's for the RNA tables + rna_table = {} + generic_table = {} + for codon, val in table.items(): + generic_table[codon] = val + codon = codon.replace("T", "U") + generic_table[codon] = val + rna_table[codon] = val + rna_start_codons = [] + generic_start_codons = [] + for codon in start_codons: + generic_start_codons.append(codon) + # We need to check if 'T' is in the codon, otherwise + # generic_start_codons may contain duplicates + if "T" in codon: + codon = codon.replace("T", "U") + generic_start_codons.append(codon) + rna_start_codons.append(codon) + rna_stop_codons = [] + generic_stop_codons = [] + for codon in stop_codons: + generic_stop_codons.append(codon) + if "T" in codon: + codon = codon.replace("T", "U") + generic_stop_codons.append(codon) + rna_stop_codons.append(codon) + + generic = NCBICodonTable( + id, names + [alt_name], generic_table, generic_start_codons, generic_stop_codons + ) + + # The following isn't very elegant, but seems to work nicely. 
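+ # For instance, merging {"T": "U"} into the ambiguous RNA values below + # lets the generic ambiguous table accept DNA-style spellings as well: + # under table 1, both "TAY" and "UAY" should then resolve to "Y", + # because "T" expands to the single meaning "U".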
+ _merged_values = dict(IUPACData.ambiguous_rna_values.items()) + _merged_values["T"] = "U" + ambig_generic = AmbiguousCodonTable( + generic, + None, + _merged_values, + IUPACData.extended_protein_letters, + IUPACData.extended_protein_values, + ) + + rna = NCBICodonTableRNA( + id, names + [alt_name], rna_table, rna_start_codons, rna_stop_codons + ) + + ambig_rna = AmbiguousCodonTable( + rna, + IUPACData.ambiguous_rna_letters, + IUPACData.ambiguous_rna_values, + IUPACData.extended_protein_letters, + IUPACData.extended_protein_values, + ) + + if id == 1: + global standard_dna_table, standard_rna_table + standard_dna_table = dna + standard_rna_table = rna + + unambiguous_dna_by_id[id] = dna + unambiguous_rna_by_id[id] = rna + generic_by_id[id] = generic + ambiguous_dna_by_id[id] = ambig_dna + ambiguous_rna_by_id[id] = ambig_rna + ambiguous_generic_by_id[id] = ambig_generic + + if alt_name is not None: + names.append(alt_name) + + for name in names: + unambiguous_dna_by_name[name] = dna + unambiguous_rna_by_name[name] = rna + generic_by_name[name] = generic + ambiguous_dna_by_name[name] = ambig_dna + ambiguous_rna_by_name[name] = ambig_rna + ambiguous_generic_by_name[name] = ambig_generic + + +# The rest of this file is automatically generated, here we turn off +# black formatting in order to keep the codon tables compact. +# +# fmt: off + +########################################################################## +# Start of auto-generated output from Scripts/update_ncbi_codon_table.py # +########################################################################## + +# Data from NCBI genetic code table version 4.5 + +register_ncbi_table( + name="Standard", + alt_name="SGC0", + id=1, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "TGA"], + start_codons=["TTG", "CTG", "ATG"], +) + +register_ncbi_table( + name="Vertebrate Mitochondrial", + alt_name="SGC1", + id=2, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", # noqa: E241 + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "AGA", "AGG"], + start_codons=["ATT", "ATC", "ATA", 
"ATG", "GTG"], +) + +register_ncbi_table( + name="Yeast Mitochondrial", + alt_name="SGC2", + id=3, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "T", "CTC": "T", "CTA": "T", "CTG": "T", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate " + "Mitochondrial; Mycoplasma; Spiroplasma", + alt_name="SGC3", + id=4, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["TTA", "TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Invertebrate Mitochondrial", + alt_name="SGC4", + id=5, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["TTG", "ATT", "ATC", "ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear", + alt_name="SGC5", + id=6, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q", + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": 
"R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Echinoderm Mitochondrial; Flatworm Mitochondrial", + alt_name="SGC8", + id=9, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["ATG", "GTG"], +) + +register_ncbi_table( + name="Euplotid Nuclear", + alt_name="SGC9", + id=10, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "C", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Bacterial, Archaeal and Plant Plastid", + alt_name=None, + id=11, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "TGA"], + start_codons=["TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Alternative Yeast Nuclear", + alt_name=None, + id=12, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": 
"S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "S", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "TGA"], + start_codons=["CTG", "ATG"], +) + +register_ncbi_table( + name="Ascidian Mitochondrial", + alt_name=None, + id=13, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "G", "AGG": "G", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["TTG", "ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Alternative Flatworm Mitochondrial", + alt_name=None, + id=14, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAG"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Blepharisma Macronuclear", + alt_name=None, + id=15, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAG": "Q", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": 
"E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Chlorophycean Mitochondrial", + alt_name=None, + id=16, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAG": "L", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Trematode Mitochondrial", + alt_name=None, + id=21, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["ATG", "GTG"], +) + +register_ncbi_table( + name="Scenedesmus obliquus Mitochondrial", + alt_name=None, + id=22, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCG": "S", # noqa: E241 + "TAT": "Y", "TAC": "Y", "TAG": "L", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TCA", "TAA", "TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Thraustochytrium Mitochondrial", + alt_name=None, + id=23, + table={ + "TTT": "F", "TTC": "F", "TTG": "L", # noqa: E241 + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", 
"ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TTA", "TAA", "TAG", "TGA"], + start_codons=["ATT", "ATG", "GTG"], +) + +register_ncbi_table( + name="Pterobranchia Mitochondrial", + alt_name=None, + id=24, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "K", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["TTG", "CTG", "ATG", "GTG"], +) + +register_ncbi_table( + name="Candidate Division SR1 and Gracilibacteria", + alt_name=None, + id=25, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "G", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["TTG", "ATG", "GTG"], +) + +register_ncbi_table( + name="Pachysolen tannophilus Nuclear", + alt_name=None, + id=26, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "A", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "TGA"], + start_codons=["CTG", "ATG"], +) + +register_ncbi_table( + name="Karyorelict Nuclear", + alt_name=None, + id=27, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": 
"S", + "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q", + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Condylostoma Nuclear", + alt_name=None, + id=28, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q", + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG", "TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Mesodinium Nuclear", + alt_name=None, + id=29, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "Y", "TAG": "Y", + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Peritrich Nuclear", + alt_name=None, + id=30, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "E", "TAG": "E", + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", 
+ }, + stop_codons=["TGA"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Blastocrithidia Nuclear", + alt_name=None, + id=31, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "E", "TAG": "E", + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TAG"], + start_codons=["ATG"], +) + +register_ncbi_table( + name="Balanophoraceae Plastid", + alt_name=None, + id=32, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAG": "W", # noqa: E241 + "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241 + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAA", "TGA"], + start_codons=["TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"], +) + +register_ncbi_table( + name="Cephalodiscidae Mitochondrial", + alt_name=None, + id=33, + table={ + "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", + "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", + "TAT": "Y", "TAC": "Y", "TAA": "Y", # noqa: E241 + "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W", + "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", + "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", + "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", + "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", + "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", + "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", + "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", + "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "K", + "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", + "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", + "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", + "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", + }, + stop_codons=["TAG"], + start_codons=["TTG", "CTG", "ATG", "GTG"], +) + +######################################################################## +# End of auto-generated output from Scripts/update_ncbi_codon_table.py # +######################################################################## diff --git a/code/lib/Bio/Data/IUPACData.py b/code/lib/Bio/Data/IUPACData.py new file mode 100644 index 0000000..42bf7a9 --- /dev/null +++ b/code/lib/Bio/Data/IUPACData.py @@ -0,0 +1,423 @@ +# Copyright 2000 Andrew Dalke. All rights reserved. 
+# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Information about the IUPAC alphabets.""" + + +protein_letters = "ACDEFGHIKLMNPQRSTVWY" +extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" +# B = "Asx"; aspartic acid or asparagine (D or N) +# X = "Xxx"; unknown or 'other' amino acid +# Z = "Glx"; glutamic acid or glutamine (E or Q) +# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 +# +# J = "Xle"; leucine or isoleucine (L or I, used in NMR) +# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html +# Also the International Nucleotide Sequence Database Collaboration (INSDC) +# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 +# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html +# +# Xle (J); Leucine or Isoleucine +# The residue abbreviations, Xle (the three-letter abbreviation) and J +# (the one-letter abbreviation) are reserved for the case that cannot +# experimentally distinguish leucine from isoleucine. +# +# U = "Sec"; selenocysteine +# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html +# +# O = "Pyl"; pyrrolysine +# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 + +protein_letters_1to3 = { + "A": "Ala", + "C": "Cys", + "D": "Asp", + "E": "Glu", + "F": "Phe", + "G": "Gly", + "H": "His", + "I": "Ile", + "K": "Lys", + "L": "Leu", + "M": "Met", + "N": "Asn", + "P": "Pro", + "Q": "Gln", + "R": "Arg", + "S": "Ser", + "T": "Thr", + "V": "Val", + "W": "Trp", + "Y": "Tyr", +} +protein_letters_1to3_extended = dict( + list(protein_letters_1to3.items()) + + list( + {"B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle", "U": "Sec", "O": "Pyl"}.items() + ) +) + +protein_letters_3to1 = {x[1]: x[0] for x in protein_letters_1to3.items()} +protein_letters_3to1_extended = { + x[1]: x[0] for x in protein_letters_1to3_extended.items() +} + +ambiguous_dna_letters = "GATCRYWSMKHBVDN" +unambiguous_dna_letters = "GATC" +ambiguous_rna_letters = "GAUCRYWSMKHBVDN" +unambiguous_rna_letters = "GAUC" + +# B == 5-bromouridine +# D == 5,6-dihydrouridine +# S == thiouridine +# W == wyosine +extended_dna_letters = "GATCBDSW" + +# are there extended forms? 
+# extended_rna_letters = "GAUCBDSW" + +# "X" is included in the following _values and _complement dictionaries, +# for historical reasons although it is not an IUPAC nucleotide, +# and so is not in the corresponding _letters strings above +ambiguous_dna_values = { + "A": "A", + "C": "C", + "G": "G", + "T": "T", + "M": "AC", + "R": "AG", + "W": "AT", + "S": "CG", + "Y": "CT", + "K": "GT", + "V": "ACG", + "H": "ACT", + "D": "AGT", + "B": "CGT", + "X": "GATC", + "N": "GATC", +} +ambiguous_rna_values = { + "A": "A", + "C": "C", + "G": "G", + "U": "U", + "M": "AC", + "R": "AG", + "W": "AU", + "S": "CG", + "Y": "CU", + "K": "GU", + "V": "ACG", + "H": "ACU", + "D": "AGU", + "B": "CGU", + "X": "GAUC", + "N": "GAUC", +} + +ambiguous_dna_complement = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "X": "X", + "N": "N", +} + +ambiguous_rna_complement = { + "A": "U", + "C": "G", + "G": "C", + "U": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "X": "X", + "N": "N", +} + + +def _make_ranges(mydict): + d = {} + for key, value in mydict.items(): + d[key] = (value, value) + return d + + +# Mass data taken from PubChem + + +# Average masses of monophosphate deoxy nucleotides +unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085} + +# Monoisotopic masses of monophosphate deoxy nucleotides +monoisotopic_unambiguous_dna_weights = { + "A": 331.06817, + "C": 307.056936, + "G": 347.063084, + "T": 322.056602, +} + +unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights) + +unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813} + +monoisotopic_unambiguous_rna_weights = { + "A": 347.063084, + "C": 323.051851, + "G": 363.057999, + "U": 324.035867, +} + +unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights) + + +def _make_ambiguous_ranges(mydict, weight_table): + range_d = {} + avg_d = {} + for letter, values in mydict.items(): + # Following line is a quick hack to skip undefined weights for U and O + if len(values) == 1 and values[0] not in weight_table: + continue + + weights = [weight_table.get(x) for x in values] + range_d[letter] = (min(weights), max(weights)) + total_w = 0.0 + for w in weights: + total_w = total_w + w + avg_d[letter] = total_w / len(weights) + return range_d, avg_d + + +ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges( + ambiguous_dna_values, unambiguous_dna_weights +) + +ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges( + ambiguous_rna_values, unambiguous_rna_weights +) + +protein_weights = { + "A": 89.0932, + "C": 121.1582, + "D": 133.1027, + "E": 147.1293, + "F": 165.1891, + "G": 75.0666, + "H": 155.1546, + "I": 131.1729, + "K": 146.1876, + "L": 131.1729, + "M": 149.2113, + "N": 132.1179, + "O": 255.3134, + "P": 115.1305, + "Q": 146.1445, + "R": 174.201, + "S": 105.0926, + "T": 119.1192, + "U": 168.0532, + "V": 117.1463, + "W": 204.2252, + "Y": 181.1885, +} + +monoisotopic_protein_weights = { + "A": 89.047678, + "C": 121.019749, + "D": 133.037508, + "E": 147.053158, + "F": 165.078979, + "G": 75.032028, + "H": 155.069477, + "I": 131.094629, + "K": 146.105528, + "L": 131.094629, + "M": 149.051049, + "N": 132.053492, + "O": 255.158292, + "P": 115.063329, + "Q": 146.069142, + "R": 174.111676, + "S": 105.042593, + "T": 119.058243, + "U": 
168.964203, + "V": 117.078979, + "W": 204.089878, + "Y": 181.073893, +} + +extended_protein_values = { + "A": "A", + "B": "ND", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "IL", + "K": "K", + "L": "L", + "M": "M", + "N": "N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "ACDEFGHIKLMNPQRSTVWY", + # TODO - Include U and O in the possible values of X? + # This could alter the extended_protein_weight_ranges ... + # by MP: Won't do this, because they are so rare. + "Y": "Y", + "Z": "QE", +} + +protein_weight_ranges = _make_ranges(protein_weights) + +extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges( + extended_protein_values, protein_weights +) + + +# For Center of Mass Calculation. +# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol +atom_weights = { + "H": 1.00794, + "D": 2.01410, + "He": 4.002602, + "Li": 6.941, + "Be": 9.012182, + "B": 10.811, + "C": 12.0107, + "N": 14.0067, + "O": 15.9994, + "F": 18.9984032, + "Ne": 20.1797, + "Na": 22.989770, + "Mg": 24.3050, + "Al": 26.981538, + "Si": 28.0855, + "P": 30.973761, + "S": 32.065, + "Cl": 35.453, + "Ar": 39.948, + "K": 39.0983, + "Ca": 40.078, + "Sc": 44.955910, + "Ti": 47.867, + "V": 50.9415, + "Cr": 51.9961, + "Mn": 54.938049, + "Fe": 55.845, + "Co": 58.933200, + "Ni": 58.6934, + "Cu": 63.546, + "Zn": 65.39, + "Ga": 69.723, + "Ge": 72.64, + "As": 74.92160, + "Se": 78.96, + "Br": 79.904, + "Kr": 83.80, + "Rb": 85.4678, + "Sr": 87.62, + "Y": 88.90585, + "Zr": 91.224, + "Nb": 92.90638, + "Mo": 95.94, + "Tc": 98.0, + "Ru": 101.07, + "Rh": 102.90550, + "Pd": 106.42, + "Ag": 107.8682, + "Cd": 112.411, + "In": 114.818, + "Sn": 118.710, + "Sb": 121.760, + "Te": 127.60, + "I": 126.90447, + "Xe": 131.293, + "Cs": 132.90545, + "Ba": 137.327, + "La": 138.9055, + "Ce": 140.116, + "Pr": 140.90765, + "Nd": 144.24, + "Pm": 145.0, + "Sm": 150.36, + "Eu": 151.964, + "Gd": 157.25, + "Tb": 158.92534, + "Dy": 162.50, + "Ho": 164.93032, + "Er": 167.259, + "Tm": 168.93421, + "Yb": 173.04, + "Lu": 174.967, + "Hf": 178.49, + "Ta": 180.9479, + "W": 183.84, + "Re": 186.207, + "Os": 190.23, + "Ir": 192.217, + "Pt": 195.078, + "Au": 196.96655, + "Hg": 200.59, + "Tl": 204.3833, + "Pb": 207.2, + "Bi": 208.98038, + "Po": 208.98, + "At": 209.99, + "Rn": 222.02, + "Fr": 223.02, + "Ra": 226.03, + "Ac": 227.03, + "Th": 232.0381, + "Pa": 231.03588, + "U": 238.02891, + "Np": 237.05, + "Pu": 244.06, + "Am": 243.06, + "Cm": 247.07, + "Bk": 247.07, + "Cf": 251.08, + "Es": 252.08, + "Fm": 257.10, + "Md": 258.10, + "No": 259.10, + "Lr": 262.11, + "Rf": 261.11, + "Db": 262.11, + "Sg": 266.12, + "Bh": 264.12, + "Hs": 269.13, + "Mt": 268.14, +} diff --git a/code/lib/Bio/Data/SCOPData.py b/code/lib/Bio/Data/SCOPData.py new file mode 100644 index 0000000..79cfd4e --- /dev/null +++ b/code/lib/Bio/Data/SCOPData.py @@ -0,0 +1,277 @@ +# Copyright Lenna Peterson (2012) +# All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Additional protein alphabets used in the SCOP database and PDB files. + +See Bio.SCOP for more information about SCOP and Biopython's SCOP module. +""" + +# This file was automatically generated from PDB data. 
+# Black would reformat this to one entry per line, so tell it not to: +# fmt: off +protein_letters_3to1 = { + "00C": "C", "01W": "X", "02K": "A", "03Y": "C", "07O": "C", + "08P": "C", "0A0": "D", "0A1": "Y", "0A2": "K", "0A8": "C", + "0AA": "V", "0AB": "V", "0AC": "G", "0AD": "G", "0AF": "W", + "0AG": "L", "0AH": "S", "0AK": "D", "0AM": "A", "0AP": "C", + "0AU": "U", "0AV": "A", "0AZ": "P", "0BN": "F", "0C ": "C", + "0CS": "A", "0DC": "C", "0DG": "G", "0DT": "T", "0FL": "A", + "0G ": "G", "0NC": "A", "0SP": "A", "0U ": "U", "0YG": "YG", + "10C": "C", "125": "U", "126": "U", "127": "U", "128": "N", + "12A": "A", "143": "C", "175": "ASG", "193": "X", "1AP": "A", + "1MA": "A", "1MG": "G", "1PA": "F", "1PI": "A", "1PR": "N", + "1SC": "C", "1TQ": "W", "1TY": "Y", "1X6": "S", "200": "F", + "23F": "F", "23S": "X", "26B": "T", "2AD": "X", "2AG": "A", + "2AO": "X", "2AR": "A", "2AS": "X", "2AT": "T", "2AU": "U", + "2BD": "I", "2BT": "T", "2BU": "A", "2CO": "C", "2DA": "A", + "2DF": "N", "2DM": "N", "2DO": "X", "2DT": "T", "2EG": "G", + "2FE": "N", "2FI": "N", "2FM": "M", "2GT": "T", "2HF": "H", + "2LU": "L", "2MA": "A", "2MG": "G", "2ML": "L", "2MR": "R", + "2MT": "P", "2MU": "U", "2NT": "T", "2OM": "U", "2OT": "T", + "2PI": "X", "2PR": "G", "2SA": "N", "2SI": "X", "2ST": "T", + "2TL": "T", "2TY": "Y", "2VA": "V", "2XA": "C", "32S": "X", + "32T": "X", "3AH": "H", "3AR": "X", "3CF": "F", "3DA": "A", + "3DR": "N", "3GA": "A", "3MD": "D", "3ME": "U", "3NF": "Y", + "3QN": "K", "3TY": "X", "3XH": "G", "4AC": "N", "4BF": "Y", + "4CF": "F", "4CY": "M", "4DP": "W", "4F3": "GYG", "4FB": "P", + "4FW": "W", "4HT": "W", "4IN": "W", "4MF": "N", "4MM": "X", + "4OC": "C", "4PC": "C", "4PD": "C", "4PE": "C", "4PH": "F", + "4SC": "C", "4SU": "U", "4TA": "N", "4U7": "A", "56A": "H", + "5AA": "A", "5AB": "A", "5AT": "T", "5BU": "U", "5CG": "G", + "5CM": "C", "5CS": "C", "5FA": "A", "5FC": "C", "5FU": "U", + "5HP": "E", "5HT": "T", "5HU": "U", "5IC": "C", "5IT": "T", + "5IU": "U", "5MC": "C", "5MD": "N", "5MU": "U", "5NC": "C", + "5PC": "C", "5PY": "T", "5SE": "U", "5ZA": "TWG", "64T": "T", + "6CL": "K", "6CT": "T", "6CW": "W", "6HA": "A", "6HC": "C", + "6HG": "G", "6HN": "K", "6HT": "T", "6IA": "A", "6MA": "A", + "6MC": "A", "6MI": "N", "6MT": "A", "6MZ": "N", "6OG": "G", + "70U": "U", "7DA": "A", "7GU": "G", "7JA": "I", "7MG": "G", + "8AN": "A", "8FG": "G", "8MG": "G", "8OG": "G", "9NE": "E", + "9NF": "F", "9NR": "R", "9NV": "V", "A ": "A", "A1P": "N", + "A23": "A", "A2L": "A", "A2M": "A", "A34": "A", "A35": "A", + "A38": "A", "A39": "A", "A3A": "A", "A3P": "A", "A40": "A", + "A43": "A", "A44": "A", "A47": "A", "A5L": "A", "A5M": "C", + "A5N": "N", "A5O": "A", "A66": "X", "AA3": "A", "AA4": "A", + "AAR": "R", "AB7": "X", "ABA": "A", "ABR": "A", "ABS": "A", + "ABT": "N", "ACB": "D", "ACL": "R", "AD2": "A", "ADD": "X", + "ADX": "N", "AEA": "X", "AEI": "D", "AET": "A", "AFA": "N", + "AFF": "N", "AFG": "G", "AGM": "R", "AGT": "C", "AHB": "N", + "AHH": "X", "AHO": "A", "AHP": "A", "AHS": "X", "AHT": "X", + "AIB": "A", "AKL": "D", "AKZ": "D", "ALA": "A", "ALC": "A", + "ALM": "A", "ALN": "A", "ALO": "T", "ALQ": "X", "ALS": "A", + "ALT": "A", "ALV": "A", "ALY": "K", "AN8": "A", "AP7": "A", + "APE": "X", "APH": "A", "API": "K", "APK": "K", "APM": "X", + "APP": "X", "AR2": "R", "AR4": "E", "AR7": "R", "ARG": "R", + "ARM": "R", "ARO": "R", "ARV": "X", "AS ": "A", "AS2": "D", + "AS9": "X", "ASA": "D", "ASB": "D", "ASI": "D", "ASK": "D", + "ASL": "D", "ASM": "X", "ASN": "N", "ASP": "D", "ASQ": "D", + "ASU": "N", "ASX": "B", 
"ATD": "T", "ATL": "T", "ATM": "T", + "AVC": "A", "AVN": "X", "AYA": "A", "AYG": "AYG", "AZK": "K", + "AZS": "S", "AZY": "Y", "B1F": "F", "B1P": "N", "B2A": "A", + "B2F": "F", "B2I": "I", "B2V": "V", "B3A": "A", "B3D": "D", + "B3E": "E", "B3K": "K", "B3L": "X", "B3M": "X", "B3Q": "X", + "B3S": "S", "B3T": "X", "B3U": "H", "B3X": "N", "B3Y": "Y", + "BB6": "C", "BB7": "C", "BB8": "F", "BB9": "C", "BBC": "C", + "BCS": "C", "BE2": "X", "BFD": "D", "BG1": "S", "BGM": "G", + "BH2": "D", "BHD": "D", "BIF": "F", "BIL": "X", "BIU": "I", + "BJH": "X", "BLE": "L", "BLY": "K", "BMP": "N", "BMT": "T", + "BNN": "F", "BNO": "X", "BOE": "T", "BOR": "R", "BPE": "C", + "BRU": "U", "BSE": "S", "BT5": "N", "BTA": "L", "BTC": "C", + "BTR": "W", "BUC": "C", "BUG": "V", "BVP": "U", "BZG": "N", + "C ": "C", "C12": "TYG", "C1X": "K", "C25": "C", "C2L": "C", + "C2S": "C", "C31": "C", "C32": "C", "C34": "C", "C36": "C", + "C37": "C", "C38": "C", "C3Y": "C", "C42": "C", "C43": "C", + "C45": "C", "C46": "C", "C49": "C", "C4R": "C", "C4S": "C", + "C5C": "C", "C66": "X", "C6C": "C", "C99": "TFG", "CAF": "C", + "CAL": "X", "CAR": "C", "CAS": "C", "CAV": "X", "CAY": "C", + "CB2": "C", "CBR": "C", "CBV": "C", "CCC": "C", "CCL": "K", + "CCS": "C", "CCY": "CYG", "CDE": "X", "CDV": "X", "CDW": "C", + "CEA": "C", "CFL": "C", "CFY": "FCYG", "CG1": "G", "CGA": "E", + "CGU": "E", "CH ": "C", "CH6": "MYG", "CH7": "KYG", "CHF": "X", + "CHG": "X", "CHP": "G", "CHS": "X", "CIR": "R", "CJO": "GYG", + "CLE": "L", "CLG": "K", "CLH": "K", "CLV": "AFG", "CM0": "N", + "CME": "C", "CMH": "C", "CML": "C", "CMR": "C", "CMT": "C", + "CNU": "U", "CP1": "C", "CPC": "X", "CPI": "X", "CQR": "GYG", + "CR0": "TLG", "CR2": "GYG", "CR5": "G", "CR7": "KYG", "CR8": "HYG", + "CRF": "TWG", "CRG": "THG", "CRK": "MYG", "CRO": "GYG", "CRQ": "QYG", + "CRU": "EYG", "CRW": "ASG", "CRX": "ASG", "CS0": "C", "CS1": "C", + "CS3": "C", "CS4": "C", "CS8": "N", "CSA": "C", "CSB": "C", + "CSD": "C", "CSE": "C", "CSF": "C", "CSH": "SHG", "CSI": "G", + "CSJ": "C", "CSL": "C", "CSO": "C", "CSP": "C", "CSR": "C", + "CSS": "C", "CSU": "C", "CSW": "C", "CSX": "C", "CSY": "SYG", + "CSZ": "C", "CTE": "W", "CTG": "T", "CTH": "T", "CUC": "X", + "CWR": "S", "CXM": "M", "CY0": "C", "CY1": "C", "CY3": "C", + "CY4": "C", "CYA": "C", "CYD": "C", "CYF": "C", "CYG": "C", + "CYJ": "X", "CYM": "C", "CYQ": "C", "CYR": "C", "CYS": "C", + "CZ2": "C", "CZO": "GYG", "CZZ": "C", "D11": "T", "D1P": "N", + "D3 ": "N", "D33": "N", "D3P": "G", "D3T": "T", "D4M": "T", + "D4P": "X", "DA ": "A", "DA2": "X", "DAB": "A", "DAH": "F", + "DAL": "A", "DAR": "R", "DAS": "D", "DBB": "T", "DBM": "N", + "DBS": "S", "DBU": "T", "DBY": "Y", "DBZ": "A", "DC ": "C", + "DC2": "C", "DCG": "G", "DCI": "X", "DCL": "X", "DCT": "C", + "DCY": "C", "DDE": "H", "DDG": "G", "DDN": "U", "DDX": "N", + "DFC": "C", "DFG": "G", "DFI": "X", "DFO": "X", "DFT": "N", + "DG ": "G", "DGH": "G", "DGI": "G", "DGL": "E", "DGN": "Q", + "DHA": "S", "DHI": "H", "DHL": "X", "DHN": "V", "DHP": "X", + "DHU": "U", "DHV": "V", "DI ": "I", "DIL": "I", "DIR": "R", + "DIV": "V", "DLE": "L", "DLS": "K", "DLY": "K", "DM0": "K", + "DMH": "N", "DMK": "D", "DMT": "X", "DN ": "N", "DNE": "L", + "DNG": "L", "DNL": "K", "DNM": "L", "DNP": "A", "DNR": "C", + "DNS": "K", "DOA": "X", "DOC": "C", "DOH": "D", "DON": "L", + "DPB": "T", "DPH": "F", "DPL": "P", "DPP": "A", "DPQ": "Y", + "DPR": "P", "DPY": "N", "DRM": "U", "DRP": "N", "DRT": "T", + "DRZ": "N", "DSE": "S", "DSG": "N", "DSN": "S", "DSP": "D", + "DT ": "T", "DTH": "T", "DTR": "W", "DTY": "Y", "DU ": 
"U", + "DVA": "V", "DXD": "N", "DXN": "N", "DYG": "DYG", "DYS": "C", + "DZM": "A", "E ": "A", "E1X": "A", "ECC": "Q", "EDA": "A", + "EFC": "C", "EHP": "F", "EIT": "T", "ENP": "N", "ESB": "Y", + "ESC": "M", "EXB": "X", "EXY": "L", "EY5": "N", "EYS": "X", + "F2F": "F", "FA2": "A", "FA5": "N", "FAG": "N", "FAI": "N", + "FB5": "A", "FB6": "A", "FCL": "F", "FFD": "N", "FGA": "E", + "FGL": "G", "FGP": "S", "FHL": "X", "FHO": "K", "FHU": "U", + "FLA": "A", "FLE": "L", "FLT": "Y", "FME": "M", "FMG": "G", + "FMU": "N", "FOE": "C", "FOX": "G", "FP9": "P", "FPA": "F", + "FRD": "X", "FT6": "W", "FTR": "W", "FTY": "Y", "FVA": "V", + "FZN": "K", "G ": "G", "G25": "G", "G2L": "G", "G2S": "G", + "G31": "G", "G32": "G", "G33": "G", "G36": "G", "G38": "G", + "G42": "G", "G46": "G", "G47": "G", "G48": "G", "G49": "G", + "G4P": "N", "G7M": "G", "GAO": "G", "GAU": "E", "GCK": "C", + "GCM": "X", "GDP": "G", "GDR": "G", "GFL": "G", "GGL": "E", + "GH3": "G", "GHG": "Q", "GHP": "G", "GL3": "G", "GLH": "Q", + "GLJ": "E", "GLK": "E", "GLM": "X", "GLN": "Q", "GLQ": "E", + "GLU": "E", "GLX": "Z", "GLY": "G", "GLZ": "G", "GMA": "E", + "GMS": "G", "GMU": "U", "GN7": "G", "GND": "X", "GNE": "N", + "GOM": "G", "GPL": "K", "GS ": "G", "GSC": "G", "GSR": "G", + "GSS": "G", "GSU": "E", "GT9": "C", "GTP": "G", "GVL": "X", + "GYC": "CYG", "GYS": "SYG", "H2U": "U", "H5M": "P", "HAC": "A", + "HAR": "R", "HBN": "H", "HCS": "X", "HDP": "U", "HEU": "U", + "HFA": "X", "HGL": "X", "HHI": "H", "HHK": "AK", "HIA": "H", + "HIC": "H", "HIP": "H", "HIQ": "H", "HIS": "H", "HL2": "L", + "HLU": "L", "HMR": "R", "HOL": "N", "HPC": "F", "HPE": "F", + "HPH": "F", "HPQ": "F", "HQA": "A", "HRG": "R", "HRP": "W", + "HS8": "H", "HS9": "H", "HSE": "S", "HSL": "S", "HSO": "H", + "HTI": "C", "HTN": "N", "HTR": "W", "HV5": "A", "HVA": "V", + "HY3": "P", "HYP": "P", "HZP": "P", "I ": "I", "I2M": "I", + "I58": "K", "I5C": "C", "IAM": "A", "IAR": "R", "IAS": "D", + "IC ": "C", "IEL": "K", "IEY": "HYG", "IG ": "G", "IGL": "G", + "IGU": "G", "IIC": "SHG", "IIL": "I", "ILE": "I", "ILG": "E", + "ILX": "I", "IMC": "C", "IML": "I", "IOY": "F", "IPG": "G", + "IPN": "N", "IRN": "N", "IT1": "K", "IU ": "U", "IYR": "Y", + "IYT": "T", "IZO": "M", "JJJ": "C", "JJK": "C", "JJL": "C", + "JW5": "N", "K1R": "C", "KAG": "G", "KCX": "K", "KGC": "K", + "KNB": "A", "KOR": "M", "KPI": "K", "KST": "K", "KYQ": "K", + "L2A": "X", "LA2": "K", "LAA": "D", "LAL": "A", "LBY": "K", + "LC ": "C", "LCA": "A", "LCC": "N", "LCG": "G", "LCH": "N", + "LCK": "K", "LCX": "K", "LDH": "K", "LED": "L", "LEF": "L", + "LEH": "L", "LEI": "V", "LEM": "L", "LEN": "L", "LET": "X", + "LEU": "L", "LEX": "L", "LG ": "G", "LGP": "G", "LHC": "X", + "LHU": "U", "LKC": "N", "LLP": "K", "LLY": "K", "LME": "E", + "LMF": "K", "LMQ": "Q", "LMS": "N", "LP6": "K", "LPD": "P", + "LPG": "G", "LPL": "X", "LPS": "S", "LSO": "X", "LTA": "X", + "LTR": "W", "LVG": "G", "LVN": "V", "LYF": "K", "LYK": "K", + "LYM": "K", "LYN": "K", "LYR": "K", "LYS": "K", "LYX": "K", + "LYZ": "K", "M0H": "C", "M1G": "G", "M2G": "G", "M2L": "K", + "M2S": "M", "M30": "G", "M3L": "K", "M5M": "C", "MA ": "A", + "MA6": "A", "MA7": "A", "MAA": "A", "MAD": "A", "MAI": "R", + "MBQ": "Y", "MBZ": "N", "MC1": "S", "MCG": "X", "MCL": "K", + "MCS": "C", "MCY": "C", "MD3": "C", "MD6": "G", "MDH": "X", + "MDO": "ASG", "MDR": "N", "MEA": "F", "MED": "M", "MEG": "E", + "MEN": "N", "MEP": "U", "MEQ": "Q", "MET": "M", "MEU": "G", + "MF3": "X", "MFC": "GYG", "MG1": "G", "MGG": "R", "MGN": "Q", + "MGQ": "A", "MGV": "G", "MGY": "G", "MHL": "L", "MHO": "M", + 
"MHS": "H", "MIA": "A", "MIS": "S", "MK8": "L", "ML3": "K", + "MLE": "L", "MLL": "L", "MLY": "K", "MLZ": "K", "MME": "M", + "MMO": "R", "MMT": "T", "MND": "N", "MNL": "L", "MNU": "U", + "MNV": "V", "MOD": "X", "MP8": "P", "MPH": "X", "MPJ": "X", + "MPQ": "G", "MRG": "G", "MSA": "G", "MSE": "M", "MSL": "M", + "MSO": "M", "MSP": "X", "MT2": "M", "MTR": "T", "MTU": "A", + "MTY": "Y", "MVA": "V", "N ": "N", "N10": "S", "N2C": "X", + "N5I": "N", "N5M": "C", "N6G": "G", "N7P": "P", "NA8": "A", + "NAL": "A", "NAM": "A", "NB8": "N", "NBQ": "Y", "NC1": "S", + "NCB": "A", "NCX": "N", "NCY": "X", "NDF": "F", "NDN": "U", + "NEM": "H", "NEP": "H", "NF2": "N", "NFA": "F", "NHL": "E", + "NIT": "X", "NIY": "Y", "NLE": "L", "NLN": "L", "NLO": "L", + "NLP": "L", "NLQ": "Q", "NMC": "G", "NMM": "R", "NMS": "T", + "NMT": "T", "NNH": "R", "NP3": "N", "NPH": "C", "NPI": "A", + "NRP": "LYG", "NRQ": "MYG", "NSK": "X", "NTY": "Y", "NVA": "V", + "NYC": "TWG", "NYG": "NYG", "NYM": "N", "NYS": "C", "NZH": "H", + "O12": "X", "O2C": "N", "O2G": "G", "OAD": "N", "OAS": "S", + "OBF": "X", "OBS": "X", "OCS": "C", "OCY": "C", "ODP": "N", + "OHI": "H", "OHS": "D", "OIC": "X", "OIP": "I", "OLE": "X", + "OLT": "T", "OLZ": "S", "OMC": "C", "OMG": "G", "OMT": "M", + "OMU": "U", "ONE": "U", "ONH": "A", "ONL": "X", "OPR": "R", + "ORN": "A", "ORQ": "R", "OSE": "S", "OTB": "X", "OTH": "T", + "OTY": "Y", "OXX": "D", "P ": "G", "P1L": "C", "P1P": "N", + "P2T": "T", "P2U": "U", "P2Y": "P", "P5P": "A", "PAQ": "Y", + "PAS": "D", "PAT": "W", "PAU": "A", "PBB": "C", "PBF": "F", + "PBT": "N", "PCA": "E", "PCC": "P", "PCE": "X", "PCS": "F", + "PDL": "X", "PDU": "U", "PEC": "C", "PF5": "F", "PFF": "F", + "PFX": "X", "PG1": "S", "PG7": "G", "PG9": "G", "PGL": "X", + "PGN": "G", "PGP": "G", "PGY": "G", "PHA": "F", "PHD": "D", + "PHE": "F", "PHI": "F", "PHL": "F", "PHM": "F", "PIA": "AYG", + "PIV": "X", "PLE": "L", "PM3": "F", "PMT": "C", "POM": "P", + "PPN": "F", "PPU": "A", "PPW": "G", "PQ1": "N", "PR3": "C", + "PR5": "A", "PR9": "P", "PRN": "A", "PRO": "P", "PRS": "P", + "PSA": "F", "PSH": "H", "PST": "T", "PSU": "U", "PSW": "C", + "PTA": "X", "PTH": "Y", "PTM": "Y", "PTR": "Y", "PU ": "A", + "PUY": "N", "PVH": "H", "PVL": "X", "PYA": "A", "PYO": "U", + "PYX": "C", "PYY": "N", "QLG": "QLG", "QMM": "Q", "QPA": "C", + "QPH": "F", "QUO": "G", "R ": "A", "R1A": "C", "R4K": "W", + "RC7": "HYG", "RE0": "W", "RE3": "W", "RIA": "A", "RMP": "A", + "RON": "X", "RT ": "T", "RTP": "N", "S1H": "S", "S2C": "C", + "S2D": "A", "S2M": "T", "S2P": "A", "S4A": "A", "S4C": "C", + "S4G": "G", "S4U": "U", "S6G": "G", "SAC": "S", "SAH": "C", + "SAR": "G", "SBL": "S", "SC ": "C", "SCH": "C", "SCS": "C", + "SCY": "C", "SD2": "X", "SDG": "G", "SDP": "S", "SEB": "S", + "SEC": "A", "SEG": "A", "SEL": "S", "SEM": "S", "SEN": "S", + "SEP": "S", "SER": "S", "SET": "S", "SGB": "S", "SHC": "C", + "SHP": "G", "SHR": "K", "SIB": "C", "SIC": "DC", "SLA": "P", + "SLR": "P", "SLZ": "K", "SMC": "C", "SME": "M", "SMF": "F", + "SMP": "A", "SMT": "T", "SNC": "C", "SNN": "N", "SOC": "C", + "SOS": "N", "SOY": "S", "SPT": "T", "SRA": "A", "SSU": "U", + "STY": "Y", "SUB": "X", "SUI": "DG", "SUN": "S", "SUR": "U", + "SVA": "S", "SVV": "S", "SVW": "S", "SVX": "S", "SVY": "S", + "SVZ": "X", "SWG": "SWG", "SYS": "C", "T ": "T", "T11": "F", + "T23": "T", "T2S": "T", "T2T": "N", "T31": "U", "T32": "T", + "T36": "T", "T37": "T", "T38": "T", "T39": "T", "T3P": "T", + "T41": "T", "T48": "T", "T49": "T", "T4S": "T", "T5O": "U", + "T5S": "T", "T66": "X", "T6A": "A", "TA3": "T", "TA4": "X", + "TAF": 
"T", "TAL": "N", "TAV": "D", "TBG": "V", "TBM": "T", + "TC1": "C", "TCP": "T", "TCQ": "Y", "TCR": "W", "TCY": "A", + "TDD": "L", "TDY": "T", "TFE": "T", "TFO": "A", "TFQ": "F", + "TFT": "T", "TGP": "G", "TH6": "T", "THC": "T", "THO": "X", + "THR": "T", "THX": "N", "THZ": "R", "TIH": "A", "TLB": "N", + "TLC": "T", "TLN": "U", "TMB": "T", "TMD": "T", "TNB": "C", + "TNR": "S", "TOX": "W", "TP1": "T", "TPC": "C", "TPG": "G", + "TPH": "X", "TPL": "W", "TPO": "T", "TPQ": "Y", "TQI": "W", + "TQQ": "W", "TRF": "W", "TRG": "K", "TRN": "W", "TRO": "W", + "TRP": "W", "TRQ": "W", "TRW": "W", "TRX": "W", "TS ": "N", + "TST": "X", "TT ": "N", "TTD": "T", "TTI": "U", "TTM": "T", + "TTQ": "W", "TTS": "Y", "TY1": "Y", "TY2": "Y", "TY3": "Y", + "TY5": "Y", "TYB": "Y", "TYI": "Y", "TYJ": "Y", "TYN": "Y", + "TYO": "Y", "TYQ": "Y", "TYR": "Y", "TYS": "Y", "TYT": "Y", + "TYU": "N", "TYW": "Y", "TYX": "X", "TYY": "Y", "TZB": "X", + "TZO": "X", "U ": "U", "U25": "U", "U2L": "U", "U2N": "U", + "U2P": "U", "U31": "U", "U33": "U", "U34": "U", "U36": "U", + "U37": "U", "U8U": "U", "UAR": "U", "UCL": "U", "UD5": "U", + "UDP": "N", "UFP": "N", "UFR": "U", "UFT": "U", "UMA": "A", + "UMP": "U", "UMS": "U", "UN1": "X", "UN2": "X", "UNK": "X", + "UR3": "U", "URD": "U", "US1": "U", "US2": "U", "US3": "T", + "US5": "U", "USM": "U", "VAD": "V", "VAF": "V", "VAL": "V", + "VB1": "K", "VDL": "X", "VLL": "X", "VLM": "X", "VMS": "X", + "VOL": "X", "WCR": "GYG", "X ": "G", "X2W": "E", "X4A": "N", + "X9Q": "AFG", "XAD": "A", "XAE": "N", "XAL": "A", "XAR": "N", + "XCL": "C", "XCN": "C", "XCP": "X", "XCR": "C", "XCS": "N", + "XCT": "C", "XCY": "C", "XGA": "N", "XGL": "G", "XGR": "G", + "XGU": "G", "XPR": "P", "XSN": "N", "XTH": "T", "XTL": "T", + "XTR": "T", "XTS": "G", "XTY": "N", "XUA": "A", "XUG": "G", + "XX1": "K", "XXY": "THG", "XYG": "DYG", "Y ": "A", "YCM": "C", + "YG ": "G", "YOF": "Y", "YRR": "N", "YYG": "G", "Z ": "C", + "Z01": "A", "ZAD": "A", "ZAL": "A", "ZBC": "C", "ZBU": "U", + "ZCL": "F", "ZCY": "C", "ZDU": "U", "ZFB": "X", "ZGU": "G", + "ZHP": "N", "ZTH": "T", "ZU0": "T", "ZZJ": "A"} diff --git a/code/lib/Bio/Data/__init__.py b/code/lib/Bio/Data/__init__.py new file mode 100644 index 0000000..568286c --- /dev/null +++ b/code/lib/Bio/Data/__init__.py @@ -0,0 +1,8 @@ +# Copyright 2000 Andrew Dalke. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
+ +"""Collections of various bits of useful biological data.""" diff --git a/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc new file mode 100644 index 0000000..7566625 Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc differ diff --git a/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc new file mode 100644 index 0000000..f47d34a Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc differ diff --git a/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc new file mode 100644 index 0000000..ce87c43 Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc differ diff --git a/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..1ff1640 Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Emboss/Applications.py b/code/lib/Bio/Emboss/Applications.py new file mode 100644 index 0000000..c3eab72 --- /dev/null +++ b/code/lib/Bio/Emboss/Applications.py @@ -0,0 +1,1221 @@ +# Copyright 2001-2009 Brad Chapman. +# Revisions copyright 2009-2016 by Peter Cock. +# Revisions copyright 2009 by David Winter. +# Revisions copyright 2009-2010 by Leighton Pritchard. +# All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Code to interact with and run various EMBOSS programs (OBSOLETE). + +These classes follow the AbstractCommandline interfaces for running +programs. + +We have decided to remove this module in future, and instead recommend +building your command and invoking it via the subprocess module directly. +""" + + +from Bio.Application import _Option, _Switch, AbstractCommandline + + +class _EmbossMinimalCommandLine(AbstractCommandline): + """Base Commandline object for EMBOSS wrappers (PRIVATE). + + This is provided for subclassing, it deals with shared options + common to all the EMBOSS tools: + + Attributes: + - auto Turn off prompts + - stdout Write standard output + - filter Read standard input, write standard output + - options Prompt for standard and additional values + - debug Write debug output to program.dbg + - verbose Report some/full command line options + - help Report command line options. More + information on associated and general + qualifiers can be found with -help -verbose + - warning Report warnings + - error Report errors + - fatal Report fatal errors + - die Report dying program messages + + """ + + def __init__(self, cmd=None, **kwargs): + assert cmd is not None + extra_parameters = [ + _Switch( + ["-auto", "auto"], + "Turn off prompts.\n\n" + "Automatic mode disables prompting, so we recommend you set this " + "argument all the time when calling an EMBOSS tool from Biopython.", + ), + _Switch(["-stdout", "stdout"], "Write standard output."), + _Switch( + ["-filter", "filter"], "Read standard input, write standard output." 
+ ), + _Switch( + ["-options", "options"], + "Prompt for standard and additional values.\n\n" + "If you are calling an EMBOSS tool from within Biopython, " + "we DO NOT recommend using this option.", + ), + _Switch(["-debug", "debug"], "Write debug output to program.dbg."), + _Switch(["-verbose", "verbose"], "Report some/full command line options"), + _Switch( + ["-help", "help"], + "Report command line options.\n\n" + "More information on associated and general qualifiers " + "can be found with -help -verbose", + ), + _Switch(["-warning", "warning"], "Report warnings."), + _Switch(["-error", "error"], "Report errors."), + _Switch(["-die", "die"], "Report dying program messages."), + ] + try: + # Insert extra parameters - at the start just in case there + # are any arguments which must come last: + self.parameters = extra_parameters + self.parameters + except AttributeError: + # Should we raise an error? The subclass should have set this up! + self.parameters = extra_parameters + AbstractCommandline.__init__(self, cmd, **kwargs) + + +class _EmbossCommandLine(_EmbossMinimalCommandLine): + """Base Commandline object for EMBOSS wrappers (PRIVATE). + + This is provided for subclassing, it deals with shared options + common to all the EMBOSS tools plus: + + - outfile Output filename + + """ + + def __init__(self, cmd=None, **kwargs): + assert cmd is not None + extra_parameters = [ + _Option(["-outfile", "outfile"], "Output filename", filename=True) + ] + try: + # Insert extra parameters - at the start just in case there + # are any arguments which must come last: + self.parameters = extra_parameters + self.parameters + except AttributeError: + # Should we raise an error? The subclass should have set this up! + self.parameters = extra_parameters + _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs) + + def _validate(self): + # Check the outfile, filter, or stdout option has been set. + # We can't simply do this via the required flag for the outfile + # output - this seems the simplest solution. + if not (self.outfile or self.filter or self.stdout): + raise ValueError( + "You must either set outfile (output filename), " + "or enable filter or stdout (output to stdout)." + ) + return _EmbossMinimalCommandLine._validate(self) + + +class Primer3Commandline(_EmbossCommandLine): + """Commandline object for the Primer3 interface from EMBOSS. + + The precise set of supported arguments depends on your version of EMBOSS. + This version accepts arguments current at EMBOSS 6.1.0: + + >>> cline = Primer3Commandline(sequence="mysequence.fas", auto=True, hybridprobe=True) + >>> cline.explainflag = True + >>> cline.osizeopt=20 + >>> cline.psizeopt=200 + >>> cline.outfile = "myresults.out" + >>> cline.bogusparameter = 1967 # Invalid parameter + Traceback (most recent call last): + ... + ValueError: Option name bogusparameter was not found. + >>> print(cline) + eprimer3 -auto -outfile=myresults.out -sequence=mysequence.fas -hybridprobe=True -psizeopt=200 -osizeopt=20 -explainflag=True + + """ + + def __init__(self, cmd="eprimer3", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "Sequence to choose primers from.", + is_required=True, + ), + _Option(["-task", "task"], "Tell eprimer3 what task to perform."), + _Option( + ["-hybridprobe", "hybridprobe"], + "Find an internal oligo to use as a hyb probe.", + ), + _Option( + ["-numreturn", "numreturn"], "Maximum number of primer pairs to return." 
+ ), + _Option( + ["-includedregion", "includedregion"], + "Subregion of the sequence in which to pick primers.", + ), + _Option(["-target", "target"], "Sequence to target for flanking primers."), + _Option( + ["-excludedregion", "excludedregion"], + "Regions to exclude from primer picking.", + ), + _Option( + ["-forwardinput", "forwardinput"], + "Sequence of a forward primer to check.", + ), + _Option( + ["-reverseinput", "reverseinput"], + "Sequence of a reverse primer to check.", + ), + _Option( + ["-gcclamp", "gcclamp"], + "The required number of Gs and Cs at the 3' of each primer.", + ), + _Option(["-osize", "osize"], "Optimum length of a primer oligo."), + _Option(["-minsize", "minsize"], "Minimum length of a primer oligo."), + _Option(["-maxsize", "maxsize"], "Maximum length of a primer oligo."), + _Option( + ["-otm", "otm"], + "Melting temperature for primer oligo (OBSOLETE).\n\n" + "Option replaced in EMBOSS 6.6.0 by -opttm", + ), + _Option( + ["-opttm", "opttm"], + "Optimum melting temperature for a primer oligo.\n\n" + "Option added in EMBOSS 6.6.0, replacing -otm", + ), + _Option( + ["-mintm", "mintm"], "Minimum melting temperature for a primer oligo." + ), + _Option( + ["-maxtm", "maxtm"], "Maximum melting temperature for a primer oligo." + ), + _Option( + ["-maxdifftm", "maxdifftm"], + "Maximum difference in melting temperatures between " + "forward and reverse primers.", + ), + _Option(["-ogcpercent", "ogcpercent"], "Optimum GC% for a primer."), + _Option(["-mingc", "mingc"], "Minimum GC% for a primer."), + _Option(["-maxgc", "maxgc"], "Maximum GC% for a primer."), + _Option( + ["-saltconc", "saltconc"], "Millimolar salt concentration in the PCR." + ), + _Option( + ["-dnaconc", "dnaconc"], + "Nanomolar concentration of annealing oligos in the PCR.", + ), + _Option( + ["-maxpolyx", "maxpolyx"], + "Maximum allowable mononucleotide repeat length in a primer.", + ), + # Primer length: + _Option(["-psizeopt", "psizeopt"], "Optimum size for the PCR product."), + _Option( + ["-prange", "prange"], "Acceptable range of length for the PCR product." + ), + # Primer temperature: + _Option( + ["-ptmopt", "ptmopt"], + "Optimum melting temperature for the PCR product.", + ), + _Option( + ["-ptmmin", "ptmmin"], + "Minimum allowed melting temperature for the amplicon.", + ), + _Option( + ["-ptmmax", "ptmmax"], + "Maximum allowed melting temperature for the amplicon.", + ), + # Note to self, should be -oexcludedregion not -oexcluderegion + _Option( + ["-oexcludedregion", "oexcludedregion"], + "Do not pick internal oligos in this region.", + ), + _Option(["-oligoinput", "oligoinput"], "Sequence of the internal oligo."), + # Oligo length: + _Option(["-osizeopt", "osizeopt"], "Optimum length of internal oligo."), + _Option(["-ominsize", "ominsize"], "Minimum length of internal oligo."), + _Option(["-omaxsize", "omaxsize"], "Maximum length of internal oligo."), + # Oligo GC temperature: + _Option( + ["-otmopt", "otmopt"], "Optimum melting temperature of internal oligo." + ), + _Option( + ["-otmmin", "otmmin"], "Minimum melting temperature of internal oligo." + ), + _Option( + ["-otmmax", "otmmax"], "Maximum melting temperature of internal oligo." 
+ ), + # Oligo GC percent: + _Option(["-ogcopt", "ogcopt"], "Optimum GC% for internal oligo."), + _Option(["-ogcmin", "ogcmin"], "Minimum GC% for internal oligo."), + _Option(["-ogcmax", "ogcmax"], "Maximum GC% for internal oligo."), + # Oligo salt concentration: + _Option( + ["-osaltconc", "osaltconc"], + "Millimolar concentration of salt in the hybridisation.", + ), + _Option( + ["-odnaconc", "odnaconc"], + "Nanomolar concentration of internal oligo in the hybridisation.", + ), + # Oligo self complementarity + _Option( + ["-oanyself", "oanyself"], + "Maximum allowable alignment score for self-complementarity.", + ), + _Option( + ["-oendself", "oendself"], + "Max 3'-anchored self-complementarity global alignment score.", + ), + _Option( + ["-opolyxmax", "opolyxmax"], + "Maximum length of mononucleotide repeat in internal oligo.", + ), + _Option( + ["-mispriminglibraryfile", "mispriminglibraryfile"], + "File containing library of sequences to avoid amplifying", + ), + _Option( + ["-maxmispriming", "maxmispriming"], + "Maximum allowed similarity of primers to sequences in " + "library specified by -mispriminglibrary", + ), + _Option( + ["-omishybmax", "omishybmax"], + "Maximum alignment score for hybridisation of internal oligo to " + "library specified by -mishyblibraryfile.", + ), + _Option( + ["-mishyblibraryfile", "mishyblibraryfile"], + "Library file of seqs to avoid internal oligo hybridisation.", + ), + _Option( + ["-explainflag", "explainflag"], + "Produce output tags with eprimer3 statistics", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class PrimerSearchCommandline(_EmbossCommandLine): + """Commandline object for the primersearch program from EMBOSS.""" + + def __init__(self, cmd="primersearch", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-seqall", "-sequences", "sequences", "seqall"], + "Sequence to look for the primer pairs in.", + is_required=True, + ), + # When this wrapper was written primersearch used -sequences + # as the argument name. Since at least EMBOSS 5.0 (and + # perhaps earlier) this has been -seqall instead. + _Option( + ["-infile", "-primers", "primers", "infile"], + "File containing the primer pairs to search for.", + filename=True, + is_required=True, + ), + # When this wrapper was written primersearch used -primers + # as the argument name. Since at least EMBOSS 5.0 (and + # perhaps earlier) this has been -infile instead. + _Option( + ["-mismatchpercent", "mismatchpercent"], + "Allowed percentage mismatch (any integer value, default 0).", + is_required=True, + ), + _Option( + ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" + ), + _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FDNADistCommandline(_EmbossCommandLine): + """Commandline object for the fdnadist program from EMBOSS. + + fdnadist is an EMBOSS wrapper for the PHYLIP program dnadist for + calulating distance matrices from DNA sequence files. + """ + + def __init__(self, cmd="fdnadist", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "seq file to use (phylip)", + filename=True, + is_required=True, + ), + _Option(["-method", "method"], "sub. 
model [f,k,j,l,s]", is_required=True), + _Option(["-gamma", "gamma"], "gamma [g, i,n]"), + _Option( + ["-ncategories", "ncategories"], "number of rate catergories (1-9)" + ), + _Option(["-rate", "rate"], "rate for each category"), + _Option( + ["-categories", "categories"], "File of substitution rate categories" + ), + _Option(["-weights", "weights"], "weights file"), + _Option( + ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" + ), + _Option(["-invarfrac", "invarfrac"], "proportoin of invariant sites"), + _Option(["-ttratio", "ttratio"], "ts/tv ratio"), + _Option(["-freqsfrom", "freqsfrom"], "use emprical base freqs"), + _Option(["-basefreq", "basefreq"], "specify basefreqs"), + _Option(["-lower", "lower"], "lower triangle matrix (y/N)"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FTreeDistCommandline(_EmbossCommandLine): + """Commandline object for the ftreedist program from EMBOSS. + + ftreedist is an EMBOSS wrapper for the PHYLIP program treedist used for + calulating distance measures between phylogentic trees. + """ + + def __init__(self, cmd="ftreedist", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-intreefile", "intreefile"], + "tree file to score (phylip)", + filename=True, + is_required=True, + ), + _Option(["-dtype", "dtype"], "distance type ([S]ymetric, [b]ranch score)"), + _Option( + ["-pairing", "pairing"], + "tree pairing method ([A]djacent pairs, all [p]ossible pairs)", + ), + _Option(["-style", "style"], "output style - [V]erbose, [f]ill, [s]parse"), + _Option(["-noroot", "noroot"], "treat trees as rooted [N/y]"), + _Option( + ["-outgrno", "outgrno"], + "which taxon to root the trees with (starts from 0)", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FNeighborCommandline(_EmbossCommandLine): + """Commandline object for the fneighbor program from EMBOSS. + + fneighbor is an EMBOSS wrapper for the PHYLIP program neighbor used for + calulating neighbor-joining or UPGMA trees from distance matrices. + """ + + def __init__(self, cmd="fneighbor", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-datafile", "datafile"], + "dist file to use (phylip)", + filename=True, + is_required=True, + ), + _Option( + ["-matrixtype", "matrixtype"], + "is martrix [S]quare pr [u]pper or [l]ower", + ), + _Option(["-treetype", "treetype"], "nj or UPGMA tree (n/u)"), + _Option(["-outgrno", "outgrno"], "taxon to use as OG"), + _Option(["-jumble", "jumble"], "randommise input order (Y/n)"), + _Option(["-seed", "seed"], "provide a random seed"), + _Option(["-trout", "trout"], "write tree (Y/n)"), + _Option(["-outtreefile", "outtreefile"], "filename for output tree"), + _Option(["-progress", "progress"], "print progress (Y/n)"), + _Option(["-treeprint", "treeprint"], "print tree (Y/n)"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FSeqBootCommandline(_EmbossCommandLine): + """Commandline object for the fseqboot program from EMBOSS. + + fseqboot is an EMBOSS wrapper for the PHYLIP program seqboot used to + pseudo-sample alignment files. 
+ """ + + def __init__(self, cmd="fseqboot", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "seq file to sample (phylip)", + filename=True, + is_required=True, + ), + _Option(["-categories", "categories"], "file of input categories"), + _Option(["-weights", "weights"], "weights file"), + _Option(["-test", "test"], "specify operation, default is bootstrap"), + _Option(["-regular", "regular"], "absolute number to resample"), + _Option(["-fracsample", "fracsample"], "fraction to resample"), + _Option( + ["-rewriteformat", "rewriteformat"], + "output format ([P]hylip, [n]exus, [x]ml)", + ), + _Option(["-seqtype", "seqtype"], "output format ([D]na, [p]rotein, [r]na)"), + _Option(["-blocksize", "blocksize"], "block size for bootstrapping"), + _Option(["-reps", "reps"], "how many replicates (defaults to 100)"), + _Option( + ["-justweights", "justweights"], + "what to write out: [D]atasets or just [w]eights", + ), + _Option(["-seed", "seed"], "specify random seed"), + _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FDNAParsCommandline(_EmbossCommandLine): + """Commandline object for the fdnapars program from EMBOSS. + + fdnapars is an EMBOSS version of the PHYLIP program dnapars, for + estimating trees from DNA sequences using parsimony. Calling this command + without providing a value for the option "-intreefile" will invoke + "interactive mode" (and as a result fail if called with subprocess) if + "-auto" is not set to true. + """ + + def __init__(self, cmd="fdnapars", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "seq file to use (phylip)", + filename=True, + is_required=True, + ), + _Option(["-intreefile", "intreefile"], "Phylip tree file"), + _Option(["-weights", "weights"], "weights file"), + _Option(["-maxtrees", "maxtrees"], "max trees to save during run"), + _Option(["-thorough", "thorough"], "more thorough search (Y/n)"), + _Option(["-rearrange", "rearrange"], "Rearrange on just 1 best tree (Y/n)"), + _Option( + ["-transversion", "transversion"], "Use transversion parsimony (y/N)" + ), + _Option( + ["-njumble", "njumble"], + "number of times to randomise input order (default is 0)", + ), + _Option(["-seed", "seed"], "provide random seed"), + _Option(["-outgrno", "outgrno"], "Specify outgroup"), + _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), + _Option(["-threshold", "threshold"], "Threshold value"), + _Option(["-trout", "trout"], "Write trees to file (Y/n)"), + _Option(["-outtreefile", "outtreefile"], "filename for output tree"), + _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FProtParsCommandline(_EmbossCommandLine): + """Commandline object for the fprotpars program from EMBOSS. + + fprotpars is an EMBOSS version of the PHYLIP program protpars, for + estimating trees from protein sequences using parsimony. Calling this + command without providing a value for the option "-intreefile" will invoke + "interactive mode" (and as a result fail if called with subprocess) if + "-auto" is not set to true. 
+ """ + + def __init__(self, cmd="fprotpars", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "seq file to use (phylip)", + filename=True, + is_required=True, + ), + _Option(["-intreefile", "intreefile"], "Phylip tree file to score"), + _Option( + ["-outtreefile", "outtreefile"], + "phylip tree output file", + filename=True, + is_required=True, + ), + _Option(["-weights", "weights"], "weights file"), + _Option(["-whichcode", "whichcode"], "which genetic code, [U,M,V,F,Y]"), + _Option( + ["-njumble", "njumble"], + "number of times to randomise input order (default is 0)", + ), + _Option(["-seed", "seed"], "provide random seed"), + _Option(["-outgrno", "outgrno"], "Specify outgroup"), + _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), + _Option(["-threshold", "threshold"], "Threshold value"), + _Option(["-trout", "trout"], "Write trees to file (Y/n)"), + _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FProtDistCommandline(_EmbossCommandLine): + """Commandline object for the fprotdist program from EMBOSS. + + fprotdist is an EMBOSS wrapper for the PHYLIP program protdist used to + calculate distance matrices from protein sequences. + """ + + def __init__(self, cmd="fprotdist", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "seq file to use (phylip)", + filename=True, + is_required=True, + ), + _Option( + ["-ncategories", "ncategories"], "number of rate categories (1-9)" + ), + _Option(["-rate", "rate"], "rate for each category"), + _Option(["-categories", "categories"], "file of rates"), + _Option(["-weights", "weights"], "weights file"), + _Option(["-method", "method"], "sub. model [j,h,d,k,s,c]"), + _Option(["-gamma", "gamma"], "gamma [g, i,c]"), + _Option( + ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" + ), + _Option( + ["-invarcoefficient", "invarcoefficient"], + "float for variation of substitution rate among sites", + ), + _Option(["-aacateg", "aacateg"], "Choose the category to use [G,C,H]"), + _Option(["-whichcode", "whichcode"], "genetic code [c,m,v,f,y]"), + _Option(["-ease", "ease"], "Prob change category (float between 0 and 1)"), + _Option(["-ttratio", "ttratio"], "Transition/transversion ratio (0-1)"), + _Option( + ["-basefreq", "basefreq"], "DNA base frequencies (space separated list)" + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FConsenseCommandline(_EmbossCommandLine): + """Commandline object for the fconsense program from EMBOSS. + + fconsense is an EMBOSS wrapper for the PHYLIP program consense used to + calculate consensus trees. 
+ """ + + def __init__(self, cmd="fconsense", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-intreefile", "intreefile"], + "file with phylip trees to make consensus from", + filename=True, + is_required=True, + ), + _Option(["-method", "method"], "consensus method [s, mr, MRE, ml]"), + _Option( + ["-mlfrac", "mlfrac"], + "cut-off freq for branch to appear in consensus (0.5-1.0)", + ), + _Option(["-root", "root"], "treat trees as rooted (YES, no)"), + _Option(["-outgrno", "outgrno"], "OTU to use as outgroup (starts from 0)"), + _Option(["-trout", "trout"], "treat trees as rooted (YES, no)"), + _Option( + ["-outtreefile", "outtreefile"], "Phylip tree output file (optional)" + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class WaterCommandline(_EmbossCommandLine): + """Commandline object for the water program from EMBOSS.""" + + def __init__(self, cmd="water", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "First sequence to align", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Second sequence to align", + filename=True, + is_required=True, + ), + _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), + _Option( + ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True + ), + _Option(["-datafile", "datafile"], "Matrix file", filename=True), + _Switch( + ["-nobrief", "nobrief"], "Display extended identity and similarity" + ), + _Switch(["-brief", "brief"], "Display brief identity and similarity"), + _Option( + ["-similarity", "similarity"], "Display percent identity and similarity" + ), + _Option( + ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" + ), + _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), + _Option( + ["-aformat", "aformat"], + "Display output in a different specified output format", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class NeedleCommandline(_EmbossCommandLine): + """Commandline object for the needle program from EMBOSS.""" + + def __init__(self, cmd="needle", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "First sequence to align", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Second sequence to align", + filename=True, + is_required=True, + ), + _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), + _Option( + ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True + ), + _Option(["-datafile", "datafile"], "Matrix file", filename=True), + _Option(["-endweight", "endweight"], "Apply And gap penalties"), + _Option( + ["-endopen", "endopen"], + "The score taken away when an end gap is created.", + ), + _Option( + ["-endextend", "endextend"], + "The score added to the end gap penality for each base or " + "residue in the end gap.", + ), + _Switch( + ["-nobrief", "nobrief"], "Display extended identity and similarity" + ), + _Switch(["-brief", "brief"], "Display brief identity and similarity"), + _Option( + ["-similarity", "similarity"], "Display percent identity and similarity" + ), + _Option( + ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" + ), + _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), + _Option( + ["-aformat", "aformat"], + "Display output in a different specified output format", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + 
+class NeedleallCommandline(_EmbossCommandLine): + """Commandline object for the needleall program from EMBOSS.""" + + def __init__(self, cmd="needleall", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "First sequence to align", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Second sequence to align", + filename=True, + is_required=True, + ), + _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), + _Option( + ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True + ), + _Option(["-datafile", "datafile"], "Matrix file", filename=True), + _Option( + ["-minscore", "minscore"], + "Exclude alignments with scores below this threshold score.", + ), + _Option(["-errorfile", "errorfile"], "Error file to be written to."), + _Option(["-endweight", "endweight"], "Apply end gap penalties"), + _Option( + ["-endopen", "endopen"], + "The score taken away when an end gap is created.", + ), + _Option( + ["-endextend", "endextend"], + "The score added to the end gap penalty for each base or " + "residue in the end gap.", + ), + _Switch( + ["-nobrief", "nobrief"], "Display extended identity and similarity" + ), + _Switch(["-brief", "brief"], "Display brief identity and similarity"), + _Option( + ["-similarity", "similarity"], "Display percent identity and similarity" + ), + _Option( + ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" + ), + _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), + _Option( + ["-aformat", "aformat"], + "Display output in a different specified output format", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class StretcherCommandline(_EmbossCommandLine): + """Commandline object for the stretcher program from EMBOSS.""" + + def __init__(self, cmd="stretcher", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "First sequence to align", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Second sequence to align", + filename=True, + is_required=True, + ), + _Option( + ["-gapopen", "gapopen"], + "Gap open penalty", + is_required=True, + checker_function=lambda value: isinstance(value, int), + ), + _Option( + ["-gapextend", "gapextend"], + "Gap extension penalty", + is_required=True, + checker_function=lambda value: isinstance(value, int), + ), + _Option(["-datafile", "datafile"], "Matrix file", filename=True), + _Option( + ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" + ), + _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), + _Option( + ["-aformat", "aformat"], + "Display output in a different specified output format", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class FuzznucCommandline(_EmbossCommandLine): + """Commandline object for the fuzznuc program from EMBOSS.""" + + def __init__(self, cmd="fuzznuc", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Sequence database USA", is_required=True + ), + _Option( + ["-pattern", "pattern"], + "Search pattern, using standard IUPAC one-letter codes", + is_required=True, + ), + _Option(["-pmismatch", "pmismatch"], "Number of mismatches"), + _Option(["-complement", "complement"], "Search complementary strand"), + _Option(["-rformat", "rformat"], "Specify the report format to output in."), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + 
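As an illustration of the motif-search wrappers, here is a hedged sketch of a fuzznuc run; the sequence and report file names are hypothetical, and executing the built command again assumes EMBOSS is on the PATH:

```python
# Sketch: scan a nucleotide file for an EcoRI-like site on both strands.
# "genome.fasta" and "hits.fuzznuc" are hypothetical placeholders.
from Bio.Emboss.Applications import FuzznucCommandline

fuzznuc_cline = FuzznucCommandline(
    sequence="genome.fasta",  # sequence database USA (required)
    pattern="GAATTC",         # IUPAC search pattern (required)
    pmismatch=0,              # exact matches only
    complement=True,          # also search the complementary strand
    outfile="hits.fuzznuc",   # report file, from the base class
)
print(fuzznuc_cline)
# stdout, stderr = fuzznuc_cline()  # runs fuzznuc via subprocess
```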
+class FuzzproCommandline(_EmbossCommandLine): + """Commandline object for the fuzzpro program from EMBOSS.""" + + def __init__(self, cmd="fuzzpro", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Sequence database USA", is_required=True + ), + _Option( + ["-pattern", "pattern"], + "Search pattern, using standard IUPAC one-letter codes", + is_required=True, + ), + _Option(["-pmismatch", "pmismatch"], "Number of mismatches"), + _Option(["-rformat", "rformat"], "Specify the report format to output in."), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class Est2GenomeCommandline(_EmbossCommandLine): + """Commandline object for the est2genome program from EMBOSS.""" + + def __init__(self, cmd="est2genome", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option(["-est", "est"], "EST sequence(s)", is_required=True), + _Option(["-genome", "genome"], "Genomic sequence", is_required=True), + _Option(["-match", "match"], "Score for matching two bases"), + _Option(["-mismatch", "mismatch"], "Cost for mismatching two bases"), + _Option( + ["-gappenalty", "gappenalty"], + "Cost for deleting a single base in either sequence, " + "excluding introns", + ), + _Option( + ["-intronpenalty", "intronpenalty"], + "Cost for an intron, independent of length.", + ), + _Option( + ["-splicepenalty", "splicepenalty"], + "Cost for an intron, independent of length " + "and starting/ending on donor-acceptor sites", + ), + _Option( + ["-minscore", "minscore"], + "Exclude alignments with scores below this threshold score.", + ), + _Option( + ["-reverse", "reverse"], "Reverse the orientation of the EST sequence" + ), + _Option(["-splice", "splice"], "Use donor and acceptor splice sites."), + _Option( + ["-mode", "mode"], + "This determines the comparison mode. 
'both', 'forward', or 'reverse'", + ), + _Option( + ["-best", "best"], + "Print all comparisons instead of just the best", + ), + _Option(["-space", "space"], "Space threshold for linear-space recursion."), + _Option(["-shuffle", "shuffle"], "Shuffle"), + _Option(["-seed", "seed"], "Random number seed"), + _Option(["-align", "align"], "Show the alignment."), + _Option(["-width", "width"], "Alignment width"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class ETandemCommandline(_EmbossCommandLine): + """Commandline object for the etandem program from EMBOSS.""" + + def __init__(self, cmd="etandem", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Sequence", filename=True, is_required=True + ), + _Option( + ["-minrepeat", "minrepeat"], "Minimum repeat size", is_required=True + ), + _Option( + ["-maxrepeat", "maxrepeat"], "Maximum repeat size", is_required=True + ), + _Option(["-threshold", "threshold"], "Threshold score"), + _Option(["-mismatch", "mismatch"], "Allow N as a mismatch"), + _Option(["-uniform", "uniform"], "Allow uniform consensus"), + _Option(["-rformat", "rformat"], "Output report format"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class EInvertedCommandline(_EmbossCommandLine): + """Commandline object for the einverted program from EMBOSS.""" + + def __init__(self, cmd="einverted", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Sequence", filename=True, is_required=True + ), + _Option(["-gap", "gap"], "Gap penalty", is_required=True), + _Option( + ["-threshold", "threshold"], "Minimum score threshold", is_required=True + ), + _Option(["-match", "match"], "Match score", is_required=True), + _Option(["-mismatch", "mismatch"], "Mismatch score", is_required=True), + _Option( + ["-maxrepeat", "maxrepeat"], + "Maximum separation between the start and end of repeat", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class PalindromeCommandline(_EmbossCommandLine): + """Commandline object for the palindrome program from EMBOSS.""" + + def __init__(self, cmd="palindrome", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Sequence", filename=True, is_required=True + ), + _Option( + ["-minpallen", "minpallen"], + "Minimum palindrome length", + is_required=True, + ), + _Option( + ["-maxpallen", "maxpallen"], + "Maximum palindrome length", + is_required=True, + ), + _Option( + ["-gaplimit", "gaplimit"], + "Maximum gap between repeats", + is_required=True, + ), + _Option( + ["-nummismatches", "nummismatches"], + "Number of mismatches allowed", + is_required=True, + ), + _Option( + ["-overlap", "overlap"], "Report overlapping matches", is_required=True + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class TranalignCommandline(_EmbossCommandLine): + """Commandline object for the tranalign program from EMBOSS.""" + + def __init__(self, cmd="tranalign", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "Nucleotide sequences to be aligned.", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Protein sequence alignment", + filename=True, + is_required=True, + ), + _Option( + ["-outseq", "outseq"], + "Output sequence file.", + filename=True, + is_required=True, + ), + _Option(["-table", "table"], "Code to use"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + 
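To show how the est2genome wrapper defined above is meant to be used, a small sketch follows; all file names are hypothetical and the scoring values are illustrative, not recommendations:

```python
# Sketch: splice-aware alignment of an EST against genomic DNA.
# File names below are hypothetical placeholders.
from Bio.Emboss.Applications import Est2GenomeCommandline

e2g_cline = Est2GenomeCommandline(
    est="est.fasta",           # EST sequence(s) (required)
    genome="genomic.fasta",    # genomic sequence (required)
    match=1,                   # illustrative match score
    mismatch=1,                # illustrative mismatch cost
    align=True,                # include the alignment in the report
    outfile="est2genome.txt",  # -outfile from the base class
)
print(e2g_cline)
# stdout, stderr = e2g_cline()  # requires a local EMBOSS install
```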
+ +class DiffseqCommandline(_EmbossCommandLine): + """Commandline object for the diffseq program from EMBOSS.""" + + def __init__(self, cmd="diffseq", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-asequence", "asequence"], + "First sequence to compare", + filename=True, + is_required=True, + ), + _Option( + ["-bsequence", "bsequence"], + "Second sequence to compare", + filename=True, + is_required=True, + ), + _Option( + ["-wordsize", "wordsize"], + "Word size to use for comparisons (default 10)", + is_required=True, + ), + _Option( + ["-aoutfeat", "aoutfeat"], + "File for output of first sequence's features", + filename=True, + is_required=True, + ), + _Option( + ["-boutfeat", "boutfeat"], + "File for output of second sequence's features", + filename=True, + is_required=True, + ), + _Option(["-rformat", "rformat"], "Output report file format"), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +class IepCommandline(_EmbossCommandLine): + """Commandline for EMBOSS iep: calculates isoelectric point and charge. + + Examples + -------- + >>> from Bio.Emboss.Applications import IepCommandline + >>> iep_cline = IepCommandline(sequence="proteins.faa", + ... outfile="proteins.txt") + >>> print(iep_cline) + iep -outfile=proteins.txt -sequence=proteins.faa + + You would typically run the command line with iep_cline() or via the + Python subprocess module, as described in the Biopython tutorial. + + """ + + def __init__(self, cmd="iep", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "Protein sequence(s) filename", + filename=True, + is_required=True, + ), + _Option( + ["-amino", "amino"], + """Number of N-termini + + Integer 0 (default) or more. + """, + ), + _Option( + ["-carboxyl", "carboxyl"], + """Number of C-termini + + Integer 0 (default) or more. + """, + ), + _Option( + ["-lysinemodified", "lysinemodified"], + """Number of modified lysines + + Integer 0 (default) or more. + """, + ), + _Option( + ["-disulphides", "disulphides"], + """Number of disulphide bridges + + Integer 0 (default) or more. + """, + ), + # Should we implement the -termini switch as well? + _Option( + ["-notermini", "notermini"], + "Exclude (True) or include (False) charge at N and C terminus.", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +# seqret uses -outseq, not -outfile, so use the base class: +class SeqretCommandline(_EmbossMinimalCommandLine): + """Commandline object for the seqret program from EMBOSS. + + This tool allows you to interconvert between different sequence file + formats (e.g. GenBank to FASTA). Combining Biopython's Bio.SeqIO module + with seqret using a suitable intermediate file format can allow you to + read/write to an even wider range of file formats. + + This wrapper currently only supports the core functionality; things like + feature tables (in EMBOSS 6.1.0 onwards) are not yet included. + """ + + def __init__(self, cmd="seqret", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], "Input sequence(s) filename", filename=True + ), + _Option(["-outseq", "outseq"], "Output sequence file.", filename=True), + _Option( + ["-sformat", "sformat"], + "Input sequence(s) format (e.g. fasta, genbank)", + ), + _Option( + ["-osformat", "osformat"], + "Output sequence(s) format (e.g. 
fasta, genbank)", + ), + ] + _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs) + + def _validate(self): + # Check the outseq, filter, or stdout option has been set. + # We can't simply do this via the required flag for the outseq + # output - this seems the simplest solution. + if not (self.outseq or self.filter or self.stdout): + raise ValueError( + "You must either set outseq (output filename), " + "or enable filter or stdout (output to stdout)." + ) + if not (self.sequence or self.filter): + raise ValueError( + "You must either set sequence (input filename), " + "or enable filter (input from stdin)." + ) + return _EmbossMinimalCommandLine._validate(self) + + +class SeqmatchallCommandline(_EmbossCommandLine): + """Commandline object for the seqmatchall program from EMBOSS. + + e.g. + >>> cline = SeqmatchallCommandline(sequence="opuntia.fasta", outfile="opuntia.txt") + >>> cline.auto = True + >>> cline.wordsize = 18 + >>> cline.aformat = "pair" + >>> print(cline) + seqmatchall -auto -outfile=opuntia.txt -sequence=opuntia.fasta -wordsize=18 -aformat=pair + + """ + + def __init__(self, cmd="seqmatchall", **kwargs): + """Initialize the class.""" + self.parameters = [ + _Option( + ["-sequence", "sequence"], + "Readable set of sequences", + filename=True, + is_required=True, + ), + _Option( + ["-wordsize", "wordsize"], "Word size (Integer 2 or more, default 4)" + ), + _Option( + ["-aformat", "aformat"], + "Display output in a different specified output format", + ), + ] + _EmbossCommandLine.__init__(self, cmd, **kwargs) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Emboss/Primer3.py b/code/lib/Bio/Emboss/Primer3.py new file mode 100644 index 0000000..0e210a5 --- /dev/null +++ b/code/lib/Bio/Emboss/Primer3.py @@ -0,0 +1,183 @@ +# Copyright 2008 Michiel de Hoon. +# Revisions copyright 2009 Leighton Pritchard. +# Revisions copyright 2010 Peter Cock. +# All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Code to parse output from the EMBOSS eprimer3 program. + +As elsewhere in Biopython there are two input functions, read and parse, +for single record output and multi-record output. For primer3, a single +record object is created for each target sequence and may contain +multiple primers. + +i.e. If you ran eprimer3 with a single target sequence, use the read +function. If you ran eprimer3 with multiple targets, use the parse +function to iterate over the results. +""" + + +# --- primer3 + + +class Record: + """Represent information from a primer3 run finding primers. + + Members: + + - primers - list of Primer objects describing primer pairs for + this target sequence. + - comments - the comment line(s) for the record + + """ + + def __init__(self): + """Initialize the class.""" + self.comments = "" + self.primers = [] + + +class Primers: + """A primer set designed by Primer3. 
+ + Members: + + - size - length of product, note you can use len(primer) as an + alternative to primer.size + + - forward_seq + - forward_start + - forward_length + - forward_tm + - forward_gc + + - reverse_seq + - reverse_start + - reverse_length + - reverse_tm + - reverse_gc + + - internal_seq + - internal_start + - internal_length + - internal_tm + - internal_gc + + """ + + def __init__(self): + """Initialize the class.""" + self.size = 0 + self.forward_seq = "" + self.forward_start = 0 + self.forward_length = 0 + self.forward_tm = 0.0 + self.forward_gc = 0.0 + self.reverse_seq = "" + self.reverse_start = 0 + self.reverse_length = 0 + self.reverse_tm = 0.0 + self.reverse_gc = 0.0 + self.internal_seq = "" + self.internal_start = 0 + self.internal_length = 0 + self.internal_tm = 0.0 + self.internal_gc = 0.0 + + def __len__(self): + """Length of the primer product (i.e. product size).""" + return self.size + + +def parse(handle): + """Iterate over primer3 output as Bio.Emboss.Primer3.Record objects.""" + # Skip blank lines at head of file + while True: + line = handle.readline() + if line.strip(): + break # Starting a record + + # Read each record + record = None + primer = None + while True: + if line.startswith("# EPRIMER3") or line.startswith("# PRIMER3"): + # Record data + if record is not None: + yield record + record = Record() + record.comments += line + primer = None + elif line.startswith("#"): + if ( + line.strip() + != "# Start Len Tm GC% Sequence" + ): + record.comments += line + elif not line.strip(): + pass + elif line[5:19] == "PRODUCT SIZE: ": + primer = Primers() + primer.size = int(line[19:]) + record.primers.append(primer) + elif line[5:19] == "FORWARD PRIMER": + words = line.split() + if not primer or primer.size == 0: + primer = Primers() + record.primers.append(primer) + primer.forward_start = int(words[2]) + primer.forward_length = int(words[3]) + primer.forward_tm = float(words[4]) + primer.forward_gc = float(words[5]) + primer.forward_seq = words[6] + elif line[5:19] == "REVERSE PRIMER": + words = line.split() + if not primer or primer.size == 0: + primer = Primers() + record.primers.append(primer) + primer.reverse_start = int(words[2]) + primer.reverse_length = int(words[3]) + primer.reverse_tm = float(words[4]) + primer.reverse_gc = float(words[5]) + primer.reverse_seq = words[6] + elif line[5:19] == "INTERNAL OLIGO": + words = line.split() + if not primer or primer.size == 0: + primer = Primers() + record.primers.append(primer) + primer.internal_start = int(words[2]) + primer.internal_length = int(words[3]) + primer.internal_tm = float(words[4]) + primer.internal_gc = float(words[5]) + try: + primer.internal_seq = words[6] + except IndexError: # eprimer3 reports oligo without sequence + primer.internal_seq = "" + try: + line = next(handle) + except StopIteration: + break + if record: + yield record + + +def read(handle): + """Parse primer3 output into a Bio.Emboss.Primer3.Record object. + + This is for when there is one and only one target sequence. If + designing primers for multiple sequences, use the parse function. 
+ """ + iterator = parse(handle) + try: + record = next(iterator) + except StopIteration: + raise ValueError("No records found in handle") from None + try: + next(iterator) + raise ValueError("More than one record found in handle") + except StopIteration: + pass + return record diff --git a/code/lib/Bio/Emboss/PrimerSearch.py b/code/lib/Bio/Emboss/PrimerSearch.py new file mode 100644 index 0000000..3a7fb7a --- /dev/null +++ b/code/lib/Bio/Emboss/PrimerSearch.py @@ -0,0 +1,80 @@ +# Copyright 2008 Michiel de Hoon. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Code to interact with the primersearch program from EMBOSS.""" + + +class InputRecord: + """Represent the input file into the primersearch program. + + This makes it easy to add primer information and write it out to the + simple primer file format. + """ + + def __init__(self): + """Initialize the class.""" + self.primer_info = [] + + def __str__(self): + """Summarize the primersearch input record as a string.""" + output = "" + for name, primer1, primer2 in self.primer_info: + output += "%s %s %s\n" % (name, primer1, primer2) + return output + + def add_primer_set(self, primer_name, first_primer_seq, second_primer_seq): + """Add primer information to the record.""" + self.primer_info.append((primer_name, first_primer_seq, second_primer_seq)) + + +class OutputRecord: + """Represent the information from a primersearch job. + + amplifiers is a dictionary where the keys are the primer names and + the values are a list of PrimerSearchAmplifier objects. + """ + + def __init__(self): + """Initialize the class.""" + self.amplifiers = {} + + +class Amplifier: + """Represent a single amplification from a primer.""" + + def __init__(self): + """Initialize the class.""" + self.hit_info = "" + self.length = 0 + + +def read(handle): + """Get output from primersearch into a PrimerSearchOutputRecord.""" + record = OutputRecord() + + for line in handle: + if not line.strip(): + continue + elif line.startswith("Primer name"): + name = line.split()[-1] + record.amplifiers[name] = [] + elif line.startswith("Amplimer"): + amplifier = Amplifier() + record.amplifiers[name].append(amplifier) + elif line.startswith("\tSequence: "): + amplifier.hit_info = line.replace("\tSequence: ", "") + elif line.startswith("\tAmplimer length: "): + length = line.split()[-2] + amplifier.length = int(length) + else: + amplifier.hit_info += line + + for name in record.amplifiers: + for amplifier in record.amplifiers[name]: + amplifier.hit_info = amplifier.hit_info.rstrip() + + return record diff --git a/code/lib/Bio/Emboss/__init__.py b/code/lib/Bio/Emboss/__init__.py new file mode 100644 index 0000000..630780d --- /dev/null +++ b/code/lib/Bio/Emboss/__init__.py @@ -0,0 +1,8 @@ +# Copyright 2001 Brad Chapman. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
+ +"""Code to interact with the ever-so-useful EMBOSS programs.""" diff --git a/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc new file mode 100644 index 0000000..56718f6 Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc differ diff --git a/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc new file mode 100644 index 0000000..6302571 Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc differ diff --git a/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc new file mode 100644 index 0000000..808a8cb Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc differ diff --git a/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..3515132 Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd new file mode 100644 index 0000000..43a18d0 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%Docsum_3_0_module; diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd new file mode 100644 index 0000000..64a4549 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd @@ -0,0 +1,1054 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd new file mode 100644 index 0000000..a82d6a8 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%Docsum_3_1_module; diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd new file mode 100644 index 0000000..ce57767 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd @@ -0,0 +1,1055 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_2.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.dtd new file mode 100644 index 0000000..0c04c5e
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_2.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.mod.dtd new file mode 100644 index 0000000..fd17c81
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_3.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.dtd new file mode 100644 index 0000000..36da5a0
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_3.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.mod.dtd new file mode 100644 index 0000000..c1c1169
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_4.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.dtd new file mode 100644 index 0000000..89ba0bc
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_4.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.mod.dtd new file mode 100644 index 0000000..51899a8
diff --git a/code/lib/Bio/Entrez/DTDs/EMBL_General.dtd b/code/lib/Bio/Entrez/DTDs/EMBL_General.dtd new file mode 100644 index 0000000..267f9e1
diff --git a/code/lib/Bio/Entrez/DTDs/EMBL_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/EMBL_General.mod.dtd new file mode 100644 index 0000000..1f6f6fa
diff --git a/code/lib/Bio/Entrez/DTDs/GenBank_General.dtd b/code/lib/Bio/Entrez/DTDs/GenBank_General.dtd new file mode 100644 index 0000000..c8707a9
diff --git a/code/lib/Bio/Entrez/DTDs/GenBank_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/GenBank_General.mod.dtd new file mode 100644 index 0000000..0cba454
diff --git a/code/lib/Bio/Entrez/DTDs/HomoloGene.dtd b/code/lib/Bio/Entrez/DTDs/HomoloGene.dtd new file mode 100644 index 0000000..82262e4
diff --git a/code/lib/Bio/Entrez/DTDs/HomoloGene.mod.dtd b/code/lib/Bio/Entrez/DTDs/HomoloGene.mod.dtd new file mode 100644 index 0000000..c88a5cd
diff --git a/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.dtd b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.dtd new file mode 100644 index 0000000..fdf3b96
diff --git a/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.mod.dtd b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.mod.dtd new file mode 100644 index 0000000..308423d
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB.dtd b/code/lib/Bio/Entrez/DTDs/MMDB.dtd new file mode 100644 index 0000000..738efa9
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB.mod.dtd new file mode 100644 index 0000000..8424533
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.dtd new file mode 100644 index 0000000..fd56bf7
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.mod.dtd new file mode 100644 index 0000000..5763354
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Features.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Features.dtd new file mode 100644 index 0000000..b8eb295
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Features.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Features.mod.dtd new file mode 100644 index 0000000..160fb02
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.dtd new file mode 100644 index 0000000..a5a075e
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.mod.dtd new file mode 100644 index 0000000..aa9f16c
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Access.dtd new file mode 100644 index 0000000..5dcedf0
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Access.mod.dtd new file mode 100644 index 0000000..e83ad2a
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.dtd new file mode 100644 index 0000000..17edc09
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.mod.dtd new file mode 100644 index 0000000..e0b7e1d
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.dtd new file mode 100644 index 0000000..2bfea5c
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.mod.dtd new file mode 100644 index 0000000..6b5c5da
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.dtd new file mode 100644 index 0000000..26ba5d9
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.mod.dtd new file mode 100644 index 0000000..7a2ad1a
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.dtd new file mode 100644 index 0000000..24437c5
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.mod.dtd new file mode 100644 index 0000000..f001a47
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.dtd new file mode 100644 index 0000000..451e782
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.mod.dtd new file mode 100644 index 0000000..78a99bb
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.dtd new file mode 100644 index 0000000..307176a
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.mod.dtd new file mode 100644 index 0000000..7b0f47f
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.dtd new file mode 100644 index 0000000..b2d06ad
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.mod.dtd new file mode 100644 index 0000000..7cf68d6
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.dtd new file mode 100644 index 0000000..9558045
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.mod.dtd new file mode 100644 index 0000000..16a815f
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entity.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entity.mod.dtd new file mode 100644 index 0000000..3919c3e
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.dtd new file mode 100644 index 0000000..a08a907
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.mod.dtd new file mode 100644 index 0000000..5eb72c9
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd new file mode 100644 index 0000000..36e206f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd @@ -0,0 +1,89 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Entrezgene_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd new file mode 100644 index 0000000..c75d32f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd @@ -0,0 +1,394 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd new file mode 100644 index 0000000..a3ce559 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_FeatDef_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd new file mode 100644 index 0000000..65fbf90 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd new file mode 100644 index 0000000..d317e96 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_GBSeq_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd new file mode 100644 index 0000000..95be4f3 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd @@ -0,0 +1,407 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd new file mode 100644 index 0000000..cd6d122 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd @@ -0,0 +1,20 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd new file mode 100644 index 0000000..be703af --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd new file mode 100644 index 0000000..a8bb6c1 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd new file mode 100644 index 0000000..c573ca5 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd @@ -0,0 +1,333 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd new file mode 100644 index 0000000..8e57ced --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd @@ -0,0 +1,92 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_ID1Access_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Seqset_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd new file mode 100644 index 0000000..b489907 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd @@ -0,0 +1,218 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd new file mode 100644 index 0000000..4adbfa5 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd @@ -0,0 +1,95 @@ + + + + + 
+%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_ID2Access_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_Seq_split_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Seqset_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd new file mode 100644 index 0000000..5d5ecf7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd @@ -0,0 +1,759 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd new file mode 100644 index 0000000..1082302 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd @@ -0,0 +1,35 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_MedArchive_module; + + +%NCBI_Medlars_module; + + +%NCBI_Medline_module; + + +%NCBI_Pub_module; + + +%NCBI_PubMed_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd new file mode 100644 index 0000000..b4c2701 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd @@ -0,0 +1,271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd new file mode 100644 index 0000000..6d1410c --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd @@ -0,0 +1,23 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_Medlars_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd new file mode 100644 index 0000000..cb8d48a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd new file mode 100644 index 0000000..9495345 --- /dev/null +++ 
b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd @@ -0,0 +1,23 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd new file mode 100644 index 0000000..b05a78e --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd @@ -0,0 +1,245 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd new file mode 100644 index 0000000..7dc862b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd @@ -0,0 +1,17 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Mim_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd new file mode 100644 index 0000000..664a851 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd @@ -0,0 +1,354 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd new file mode 100644 index 0000000..a7be929 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd @@ -0,0 +1,113 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%MMDB_module; + + +%MMDB_Chemical_graph_module; + + +%MMDB_Features_module; + + +%MMDB_Structural_model_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Cdd_module; + + +%NCBI_Cn3d_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Mime_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_ScoreMat_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Seqset_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd new file mode 100644 index 0000000..a7f8ef4 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd @@ -0,0 +1,251 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd new file mode 100644 index 0000000..ae5196a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd @@ -0,0 +1,17 @@ + + + + + 
+%NCBI_Entity_module; + + +%NCBI_ObjPrt_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd new file mode 100644 index 0000000..23f916a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd new file mode 100644 index 0000000..b06e17e --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd @@ -0,0 +1,20 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_General_module; + + +%NCBI_Organism_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd new file mode 100644 index 0000000..9c36c43 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd @@ -0,0 +1,226 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd new file mode 100644 index 0000000..7b35bb2 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd @@ -0,0 +1,38 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_PCAssay_module; + + +%NCBI_PCSubstance_module; + + +%NCBI_Pub_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd new file mode 100644 index 0000000..020ab07 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd @@ -0,0 +1,1006 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd new file mode 100644 index 0000000..0efe6fd --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd @@ -0,0 +1,29 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_PCSubstance_module; + + +%NCBI_Pub_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd new file mode 100644 index 0000000..479ed86 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd @@ -0,0 +1,1628 @@ + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd new file mode 100644 index 0000000..4d42013 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd @@ -0,0 +1,95 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Project_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_PubMed_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Seqset_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd new file mode 100644 index 0000000..e2215fe --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd new file mode 100644 index 0000000..e8279ea --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd @@ -0,0 +1,20 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_General_module; + + +%NCBI_Protein_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd new file mode 100644 index 0000000..e833a5d --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd new file mode 100644 index 0000000..6a52954 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd @@ -0,0 +1,26 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Pub_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd new file mode 100644 index 0000000..ca92c18 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd @@ -0,0 +1,120 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd new file mode 100644 index 0000000..b272da6 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd @@ -0,0 +1,26 @@ + + + + + +%NCBI_Entity_module; + + +%NCBI_Biblio_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_PubMed_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd new file mode 100644 index 0000000..4313a76 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd new file mode 100644 index 0000000..c64fad9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd @@ -0,0 +1,86 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Rsite_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd new file mode 100644 index 0000000..b1c7991 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd new file mode 100644 index 0000000..4696a50 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd @@ -0,0 +1,89 @@ + + + + + +%NCBI_Entity_module; + + +%EMBL_General_module; + + +%GenBank_General_module; + + +%NCBI_Biblio_module; + + +%NCBI_BioSource_module; + + +%NCBI_Gene_module; + + +%NCBI_General_module; + + +%NCBI_Medline_module; + + +%NCBI_Organism_module; + + +%NCBI_Protein_module; + + +%NCBI_Pub_module; + + +%NCBI_RNA_module; + + +%NCBI_Remap_module; + + +%NCBI_Rsite_module; + + +%NCBI_SeqTable_module; + + +%NCBI_Seqalign_module; + + +%NCBI_Seqfeat_module; + + +%NCBI_Seqloc_module; + + +%NCBI_Seqres_module; + + +%NCBI_Sequence_module; + + +%NCBI_TxInit_module; + + +%NCBI_Variation_module; + + +%PDB_General_module; + + +%PIR_General_module; + + +%PRF_General_module; + + +%SP_General_module; diff --git 
 NCBI_Remap.mod.dtd (158), NCBI_Rsite.dtd (20), NCBI_Rsite.mod.dtd (38), NCBI_ScoreMat.dtd (92),
 NCBI_ScoreMat.mod.dtd (579), NCBI_SeqCode.dtd (17), NCBI_SeqCode.mod.dtd (150),
 NCBI_SeqTable.dtd (86), NCBI_SeqTable.mod.dtd (390), NCBI_Seq_split.dtd (92),
 NCBI_Seq_split.mod.dtd (559), NCBI_Seqalign.dtd (86), NCBI_Seqalign.mod.dtd (570),
 NCBI_Seqfeat.dtd (86), NCBI_Seqfeat.mod.dtd (772), NCBI_Seqloc.dtd (86),
 NCBI_Seqloc.mod.dtd (325), NCBI_Seqres.dtd (86), NCBI_Seqres.mod.dtd (134),
 NCBI_Seqset.dtd (89), NCBI_Seqset.mod.dtd (138), NCBI_Sequence.dtd (86),
 NCBI_Sequence.mod.dtd (1112), NCBI_Submit.dtd (92), NCBI_Submit.mod.dtd (156),
 NCBI_Systems.dtd (86), NCBI_TSeq.dtd (17), NCBI_TSeq.mod.dtd (66), NCBI_TxInit.dtd (29),
 NCBI_TxInit.mod.dtd (184), NCBI_Variation.dtd (86), NCBI_Variation.mod.dtd (944),
 NCBI_all.dtd (202; the master DTD, which keeps its full run of %*_module; references covering
 every module in the set), NSE.dtd (17), NSE.mod.dtd (895), OMSSA.dtd (89), OMSSA.mod.dtd (1361),
 PDB_General.dtd (20), PDB_General.mod.dtd (70), PIR_General.dtd (86), PIR_General.mod.dtd (78),
 PRF_General.dtd (17), PRF_General.mod.dtd (56), SP_General.dtd (86),
b/code/lib/Bio/Entrez/DTDs/SP_General.mod.dtd @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent b/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent new file mode 100644 index 0000000..1ebe3ce --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent @@ -0,0 +1,309 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%htmltable.dtd; + + + diff --git a/code/lib/Bio/Entrez/DTDs/archivearticle.dtd b/code/lib/Bio/Entrez/DTDs/archivearticle.dtd new file mode 100644 index 0000000..0b81d6a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/archivearticle.dtd @@ -0,0 +1,952 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%archivecustom-modules.ent; + + + + + +%modules.ent; + + + + + + + + + + + + + +%archivecustom-classes.ent; + + + + +%default-classes.ent; + + + + +%archivecustom-mixes.ent; + + + + +%default-mixes.ent; + + + + +%archivecustom-models.ent; + + + + + + + + + + +%common.ent; + + + + + + + +%articlemeta.ent; + + + +%backmatter.ent; + + + +%display.ent; + + + + +%format.ent; + + + +%journalmeta.ent; + + + +%link.ent; + + + +%list.ent; + + + +%math.ent; + + + +%para.ent; + + + +%phrase.ent; + + + +%references.ent; + + + +%section.ent; + + + + + + + + + +%mathmlsetup.ent; + + + + + +%XHTMLtablesetup.ent; + + + + +%xmlspecchars.ent; + + + + +%chars.ent; + + + +%notat.ent; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent new file mode 100644 index 0000000..3d665ad --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent new file mode 100644 index 0000000..a5e1b05 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent @@ -0,0 +1,306 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent new file mode 100644 index 0000000..eb494c0 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent @@ -0,0 +1,756 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent new file mode 100644 index 0000000..36fbc43 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/articlemeta.ent b/code/lib/Bio/Entrez/DTDs/articlemeta.ent new file mode 100644 index 0000000..f594afe --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/articlemeta.ent @@ -0,0 +1,1811 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/backmatter.ent b/code/lib/Bio/Entrez/DTDs/backmatter.ent new file mode 100644 index 0000000..1ece324 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/backmatter.ent @@ -0,0 +1,277 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd new file mode 100644 index 0000000..78e2bae --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd new file mode 100644 index 0000000..78e2bae --- /dev/null +++ 
b/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd new file mode 100644 index 0000000..78e2bae --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd new file mode 100644 index 0000000..8a4f338 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd new file mode 100644 index 0000000..8a4f338 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd new file mode 100644 index 0000000..8a4f338 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/chars.ent b/code/lib/Bio/Entrez/DTDs/chars.ent new file mode 100644 index 0000000..19b6313 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/chars.ent @@ -0,0 +1,359 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/common.ent b/code/lib/Bio/Entrez/DTDs/common.ent new file mode 100644 index 0000000..c1907d6 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/common.ent @@ -0,0 +1,2790 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/default-classes.ent b/code/lib/Bio/Entrez/DTDs/default-classes.ent new file mode 100644 index 0000000..81d1155 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/default-classes.ent @@ -0,0 +1,704 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/default-mixes.ent b/code/lib/Bio/Entrez/DTDs/default-mixes.ent new file mode 100644 index 0000000..2f28bd9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/default-mixes.ent @@ -0,0 +1,357 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/display.ent b/code/lib/Bio/Entrez/DTDs/display.ent new file mode 100644 index 0000000..ce81f7f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/display.ent @@ -0,0 +1,1468 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd b/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd new file mode 100644 index 0000000..ac4a59f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd b/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd new file mode 100644 
index 0000000..6aa4c47 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd @@ -0,0 +1,79 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd b/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd new file mode 100644 index 0000000..934b5a9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd @@ -0,0 +1,88 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd b/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd new file mode 100644 index 0000000..3da7498 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd @@ -0,0 +1,14 @@ + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd b/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd new file mode 100644 index 0000000..15e734b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/eSpell.dtd b/code/lib/Bio/Entrez/DTDs/eSpell.dtd new file mode 100644 index 0000000..18b6265 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eSpell.dtd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd b/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd new file mode 100644 index 0000000..a10572a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/egquery.dtd b/code/lib/Bio/Entrez/DTDs/egquery.dtd new file mode 100644 index 0000000..ff53342 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/egquery.dtd @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/einfo.dtd b/code/lib/Bio/Entrez/DTDs/einfo.dtd new file mode 100644 index 0000000..f42e108 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/einfo.dtd @@ -0,0 +1,62 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/elink_020122.dtd b/code/lib/Bio/Entrez/DTDs/elink_020122.dtd new file mode 100644 index 0000000..6f93374 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/elink_020122.dtd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/esearch.dtd b/code/lib/Bio/Entrez/DTDs/esearch.dtd new file mode 100644 index 0000000..bd11e35 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/esearch.dtd @@ -0,0 +1,103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd b/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd new file mode 100644 index 0000000..a10572a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/format.ent b/code/lib/Bio/Entrez/DTDs/format.ent new file mode 100644 index 0000000..b702a9b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/format.ent @@ -0,0 +1,412 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/htmltable.dtd b/code/lib/Bio/Entrez/DTDs/htmltable.dtd new file mode 100644 index 0000000..f4432ad --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/htmltable.dtd @@ -0,0 +1,334 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamsa.ent b/code/lib/Bio/Entrez/DTDs/isoamsa.ent new file mode 100644 index 0000000..c413168 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamsa.ent @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamsb.ent b/code/lib/Bio/Entrez/DTDs/isoamsb.ent new file mode 100644 index 0000000..b74414b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamsb.ent @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamsc.ent b/code/lib/Bio/Entrez/DTDs/isoamsc.ent new file mode 100644 index 0000000..46ea221 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamsc.ent @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamsn.ent b/code/lib/Bio/Entrez/DTDs/isoamsn.ent new file mode 100644 index 0000000..a1df8b7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamsn.ent @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamso.ent b/code/lib/Bio/Entrez/DTDs/isoamso.ent new file mode 100644 index 0000000..f99cf11 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamso.ent @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isoamsr.ent b/code/lib/Bio/Entrez/DTDs/isoamsr.ent new file mode 100644 index 0000000..2251ef1 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isoamsr.ent @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isobox.ent b/code/lib/Bio/Entrez/DTDs/isobox.ent new file mode 100644 index 0000000..05e2b13 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isobox.ent @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isocyr1.ent b/code/lib/Bio/Entrez/DTDs/isocyr1.ent 
new file mode 100644 index 0000000..b4149c7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isocyr1.ent @@ -0,0 +1,88 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isocyr2.ent b/code/lib/Bio/Entrez/DTDs/isocyr2.ent new file mode 100644 index 0000000..b038bd9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isocyr2.ent @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isodia.ent b/code/lib/Bio/Entrez/DTDs/isodia.ent new file mode 100644 index 0000000..39ccfcd --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isodia.ent @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isogrk1.ent b/code/lib/Bio/Entrez/DTDs/isogrk1.ent new file mode 100644 index 0000000..a5f52ef --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isogrk1.ent @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/code/lib/Bio/Entrez/DTDs/isogrk2.ent b/code/lib/Bio/Entrez/DTDs/isogrk2.ent new file mode 100644 index 0000000..d27cc30 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isogrk2.ent @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/code/lib/Bio/Entrez/DTDs/isogrk3.ent b/code/lib/Bio/Entrez/DTDs/isogrk3.ent new file mode 100644 index 0000000..0cbde88 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isogrk3.ent @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isogrk4.ent b/code/lib/Bio/Entrez/DTDs/isogrk4.ent new file mode 100644 index 0000000..07c4d06 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isogrk4.ent @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isolat1.ent b/code/lib/Bio/Entrez/DTDs/isolat1.ent new file mode 100644 index 0000000..43ae764 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isolat1.ent @@ -0,0 +1,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isolat2.ent b/code/lib/Bio/Entrez/DTDs/isolat2.ent new file mode 100644 index 0000000..c29b828 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isolat2.ent @@ -0,0 +1,142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isomfrk.ent b/code/lib/Bio/Entrez/DTDs/isomfrk.ent new file mode 100644 index 0000000..0e1a943 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isomfrk.ent @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isomopf.ent b/code/lib/Bio/Entrez/DTDs/isomopf.ent new file mode 100644 index 0000000..4b26425 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isomopf.ent @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isomscr.ent b/code/lib/Bio/Entrez/DTDs/isomscr.ent new file mode 100644 index 0000000..a2174f0 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isomscr.ent @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/code/lib/Bio/Entrez/DTDs/isonum.ent b/code/lib/Bio/Entrez/DTDs/isonum.ent new file mode 100644 index 0000000..79f4380 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isonum.ent @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isopub.ent b/code/lib/Bio/Entrez/DTDs/isopub.ent new file mode 100644 index 0000000..9b27b63 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isopub.ent @@ -0,0 +1,105 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/isotech.ent b/code/lib/Bio/Entrez/DTDs/isotech.ent new file mode 100644 index 0000000..d94c775 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/isotech.ent @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/journalmeta.ent b/code/lib/Bio/Entrez/DTDs/journalmeta.ent new file mode 100644 index 0000000..c615e2f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/journalmeta.ent @@ -0,0 +1,341 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/link.ent b/code/lib/Bio/Entrez/DTDs/link.ent new file mode 100644 index 0000000..5481464 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/link.ent @@ -0,0 +1,510 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/list.ent b/code/lib/Bio/Entrez/DTDs/list.ent new file mode 100644 index 0000000..ab18cd9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/list.ent @@ -0,0 +1,465 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/math.ent b/code/lib/Bio/Entrez/DTDs/math.ent new file mode 100644 index 0000000..1aa543b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/math.ent @@ -0,0 +1,329 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod b/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod new file mode 100644 index 0000000..ce95673 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + + + + + + + + + + + + + + + + + + +%ent-mmlextra; + + + +%ent-mmlalias; + + +%isobox; +%isocyr1; +%isocyr2; +%isodia; +%isolat1; +%isolat2; +%isonum; +%isopub; +%isoamsa; +%isoamsb; +%isoamsc; +%isoamsn; +%isoamso; +%isoamsr; +%isogrk3; +%isomfrk; +%isomopf; +%isomscr; +%isotech; + + + + + + + + + + + + + + +%mathml.dtd; diff --git a/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod b/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod new file mode 100644 index 0000000..92a7621 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod @@ -0,0 +1 @@ + ]]> ]]> ]]> \ No newline at end of file diff --git a/code/lib/Bio/Entrez/DTDs/mathml2.dtd b/code/lib/Bio/Entrez/DTDs/mathml2.dtd new file mode 100644 index 0000000..ddd60eb --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathml2.dtd @@ -0,0 +1,1960 @@ + + + + + + + + + +%mathml-qname.mod;]]> + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%ent-mmlextra; + + + + +%ent-mmlalias; + +]]> + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod b/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod new file mode 100644 index 0000000..254bdb2 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod @@ -0,0 +1,294 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + +]]> + + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/mathml3.dtd b/code/lib/Bio/Entrez/DTDs/mathml3.dtd new file mode 100644 index 0000000..3a8886e --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathml3.dtd @@ -0,0 +1,1682 @@ + + + + + + + + + + + + + + +%mathml-qname.mod;]]> + + + +]]> + + + + + + +%isobox; + +%isocyr1; + +%isocyr2; + +%isodia; + +%isolat1; + +%isolat2; + +%isonum; + +%isopub; + +%isoamsa; + +%isoamsb; + +%isoamsc; + +%isoamsn; + +%isoamso; + +%isoamsr; + +%isogrk3; + +%isomfrk; + +%isomopf; + +%isomscr; + +%isotech; + +%mmlextra; + +%mmlalias; + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent b/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent new file mode 100644 index 0000000..76215a5 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent @@ -0,0 +1,191 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + + + + + + + + + + +%mathml.dtd; + + + diff --git a/code/lib/Bio/Entrez/DTDs/mmlalias.ent b/code/lib/Bio/Entrez/DTDs/mmlalias.ent new file mode 100644 index 0000000..1371af3 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mmlalias.ent @@ -0,0 +1,564 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/mmlextra.ent b/code/lib/Bio/Entrez/DTDs/mmlextra.ent new file mode 100644 index 0000000..850c7e7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/mmlextra.ent @@ -0,0 +1,122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/modules.ent b/code/lib/Bio/Entrez/DTDs/modules.ent new file mode 100644 index 0000000..5d8b7a6 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/modules.ent @@ -0,0 +1,417 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd b/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd new file mode 100644 index 0000000..f82c149 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd @@ -0,0 +1,271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%archive-article; + diff --git a/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd b/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd new file mode 100644 index 0000000..85f6cbe --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd @@ -0,0 +1,280 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd new file mode 100644 index 0000000..a092651 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd new file mode 100644 index 0000000..ac0ae02 
--- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd @@ -0,0 +1,201 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd new file mode 100644 index 0000000..787129b --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd new file mode 100644 index 0000000..1c5aa06 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + +%MedlineCitation; + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd new file mode 100644 index 0000000..1f935d1 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + +%MedlineCitation; + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd new file mode 100644 index 0000000..d903ebd --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + + + + + +%MedlineCitation; + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd new file mode 100644 index 0000000..6b8a447 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd @@ -0,0 +1,178 @@ + + + + + + + + + + +%NlmCommon; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd new file mode 100644 index 0000000..670005a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + +%NlmSharedCatCit; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd new file mode 100644 index 0000000..1987031 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd @@ -0,0 +1,112 @@ + + + + + + + + + + + + + + + + + +%NlmSharedCatCit; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd new file mode 100644 index 0000000..16fc7fa --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd 
b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd new file mode 100644 index 0000000..e6b4c48 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd @@ -0,0 +1,201 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd new file mode 100644 index 0000000..c520c6d --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd new file mode 100644 index 0000000..6489a8c --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd new file mode 100644 index 0000000..cda3746 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd @@ -0,0 +1,191 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd new file mode 100644 index 0000000..9566d38 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd @@ -0,0 +1,191 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd new file mode 100644 index 0000000..d8238d9 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd new file mode 100644 index 0000000..2d90743 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd @@ -0,0 +1,189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd new file mode 100644 index 0000000..32968d4 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + + + + + +%NlmCommon; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd b/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd new file mode 100644 index 0000000..d4693b0 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd new file mode 100644 index 0000000..f7397ea --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd @@ -0,0 +1,80 @@ + + + + + + + + + + +%NlmCommon; + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd new file mode 100644 index 0000000..2a41d3e --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd @@ -0,0 +1,80 @@ + + + + + + + + + + +%NlmCommon; + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/notat.ent b/code/lib/Bio/Entrez/DTDs/notat.ent new file mode 100644 index 0000000..6294521 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/notat.ent @@ -0,0 +1,172 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/para.ent b/code/lib/Bio/Entrez/DTDs/para.ent new file mode 100644 index 0000000..9838a43 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/para.ent @@ -0,0 +1,420 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/phrase.ent b/code/lib/Bio/Entrez/DTDs/phrase.ent new file mode 100644 index 0000000..b08987a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/phrase.ent @@ -0,0 +1,278 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pmc-1.dtd b/code/lib/Bio/Entrez/DTDs/pmc-1.dtd new file mode 100644 index 0000000..db84036 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pmc-1.dtd @@ -0,0 +1,900 @@ + + + + + + + + + +%PMCEntities; %ISO8879ent; %ISO9573ent; + + + + + + + + + + + + + + + + + + + + + + + + + + + +%supp_data_dtd; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%mathmlsetup.ent; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd new file mode 100644 index 0000000..1538918 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd @@ -0,0 +1,61 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd new file mode 100644 index 0000000..11d6184 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd @@ -0,0 +1,71 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd new file mode 100644 index 0000000..ea1ea8f --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd @@ -0,0 +1,71 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd new file mode 100644 index 0000000..62b71f8 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd @@ -0,0 +1,72 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd new file mode 100644 index 0000000..adc5272 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd new file mode 100644 index 0000000..6298eb6 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd new file mode 100644 index 0000000..dacafb8 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd new file mode 100644 index 0000000..82bd9c6 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd new file mode 100644 index 0000000..a3c640c --- /dev/null +++ 
b/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd new file mode 100644 index 0000000..4570815 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd new file mode 100644 index 0000000..7c0933d --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd @@ -0,0 +1,79 @@ + + + + + +%Medline; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%Bookdoc; + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd new file mode 100644 index 0000000..48bec8a --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd new file mode 100644 index 0000000..aab61e7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd @@ -0,0 +1,454 @@ + + + + + + + +%mathml-in-pubmed; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd new file mode 100644 index 0000000..a1cd167 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd @@ -0,0 +1,478 @@ + + + + + + + +%mathml-in-pubmed; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
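These DTD files are bundled so that Bio.Entrez can validate and shape the XML it downloads without having to fetch each DTD from NCBI on every request. A minimal, illustrative sketch of how they come into play (this assumes Biopython is installed, network access to NCBI, and a placeholder contact address):

```python
# Illustrative sketch of a DTD-driven, validating Entrez request.
from Bio import Entrez

Entrez.email = "you@example.com"             # placeholder; NCBI asks for a real address
handle = Entrez.einfo()                      # the XML reply names einfo.dtd at the top
record = Entrez.read(handle, validate=True)  # the DTD decides list vs. dict shapes
handle.close()
print(record["DbList"][:5])                  # database names such as 'pubmed', 'protein'
```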
diff --git a/code/lib/Bio/Entrez/DTDs/references.ent b/code/lib/Bio/Entrez/DTDs/references.ent new file mode 100644 index 0000000..9e63a18 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/references.ent @@ -0,0 +1,726 @@
+ [declarations not captured]
diff --git a/code/lib/Bio/Entrez/DTDs/section.ent b/code/lib/Bio/Entrez/DTDs/section.ent new file mode 100644 index 0000000..1623ac7 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/section.ent @@ -0,0 +1,220 @@
+ [declarations not captured]
diff --git a/code/lib/Bio/Entrez/DTDs/taxon.dtd b/code/lib/Bio/Entrez/DTDs/taxon.dtd new file mode 100644 index 0000000..fadf481 --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/taxon.dtd @@ -0,0 +1,131 @@
+ [declarations not captured]
diff --git a/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent b/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent new file mode 100644 index 0000000..d9914bf --- /dev/null +++ b/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent @@ -0,0 +1,290 @@
+ %ISOlat1; %ISOlat2; %ISObox; %ISOdia; %ISOnum; %ISOpub; %ISOtech; %ISOgrk1; %ISOgrk2; %ISOgrk3; %ISOgrk4; %ISOcyr1; %ISOcyr2; %ISOamsa; %ISOamsb; %ISOamsc; %ISOamsn; %ISOamso; %ISOamsr; %ISOmscr; %ISOmfrk; %ISOmopf; [other declarations not captured]
diff --git a/code/lib/Bio/Entrez/Parser.py b/code/lib/Bio/Entrez/Parser.py new file mode 100644 index 0000000..98ed876 --- /dev/null +++ b/code/lib/Bio/Entrez/Parser.py @@ -0,0 +1,1005 @@
+# Copyright 2008-2014 by Michiel de Hoon. All rights reserved.
+# Revisions copyright 2008-2015 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parser for XML results returned by NCBI's Entrez Utilities.
+
+This parser is used by the read() function in Bio.Entrez, and is not
+intended to be used directly.
+
+The question is how to represent an XML file as Python objects. Some
+XML files returned by NCBI look like lists, others look like dictionaries,
+and others look like a mix of lists and dictionaries.
+
+My approach is to classify each possible element in the XML as a plain
+string, an integer, a list, a dictionary, or a structure. The latter is a
+dictionary where the same key can occur multiple times; in Python, it is
+represented as a dictionary where that key occurs once, pointing to a list
+of values found in the XML file.
+
+The parser then goes through the XML and creates the appropriate Python
+object for each element. The different levels encountered in the XML are
+preserved on the Python side. So a subelement of a subelement of an element
+is a value in a dictionary that is stored in a list which is a value in
+some other dictionary (or a value in a list which itself belongs to a list
+which is a value in a dictionary, and so on). Attributes encountered in
+the XML are stored as a dictionary in a member .attributes of each element,
+and the tag name is saved in a member .tag.
+
+To decide which kind of Python object corresponds to each element in the
+XML, the parser analyzes the DTD referred to at the top of (almost) every
+XML file returned by the Entrez Utilities. This is preferred over a
+hand-written solution, since the number of DTDs is rather large and their
+contents may change over time. About half the code in this parser deals
+with parsing the DTD, and the other half with the XML itself.
+"""
+import os
+import warnings
+from collections import Counter
+from xml.parsers import expat
+from io import BytesIO
+import xml.etree.ElementTree as ET
+from xml.sax.saxutils import escape
+
+from urllib.request import urlopen
+from urllib.parse import urlparse
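+
+# Illustrative sketch (not exercised by this module; values are made up):
+# the element classes defined below let results from Bio.Entrez.read() be
+# navigated like plain Python objects while keeping their XML metadata.
+# For an EInfo reply, for example:
+#
+#     record = Entrez.read(handle)    # a DictionaryElement
+#     record["DbList"]                # a ListElement of database names
+#     record["DbList"][0]             # a StringElement such as 'pubmed'
+#     record["DbList"].tag            # 'DbList'
+#     record["DbList"].attributes     # the XML attributes, as a plain dict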
+
+My approach is to classify each possible element in the XML as a plain
+string, an integer, a list, a dictionary, or a structure. The latter is a
+dictionary where the same key can occur multiple times; in Python, it is
+represented as a dictionary where that key occurs once, pointing to a list
+of values found in the XML file.
+
+The parser then goes through the XML and creates the appropriate Python
+object for each element. The different levels encountered in the XML are
+preserved on the Python side. So a subelement of a subelement of an element
+is a value in a dictionary that is stored in a list which is a value in
+some other dictionary (or a value in a list which itself belongs to a list
+which is a value in a dictionary, and so on). Attributes encountered in
+the XML are stored as a dictionary in a member .attributes of each element,
+and the tag name is saved in a member .tag.
+
+To decide which kind of Python object corresponds to each element in the
+XML, the parser analyzes the DTD referred to at the top of (almost) every
+XML file returned by the Entrez Utilities. This is preferred over a
+hand-written solution, since the number of DTDs is rather large and their
+contents may change over time. About half the code in this parser deals
+with parsing the DTD, and the other half with the XML itself.
+"""
+import os
+import warnings
+from collections import Counter
+from xml.parsers import expat
+from io import BytesIO
+import xml.etree.ElementTree as ET
+from xml.sax.saxutils import escape
+
+from urllib.request import urlopen, urlparse
+
+
+# The following four classes are used to add a member .attributes to integers,
+# strings, lists, and dictionaries, respectively.
+
+
+class NoneElement:
+    """NCBI Entrez XML element mapped to None."""
+
+    def __init__(self, tag, attributes, key=None):
+        """Create a NoneElement."""
+        self.tag = tag
+        if key is None:
+            self.key = tag
+        else:
+            self.key = key
+        self.attributes = attributes
+
+    def __eq__(self, other):
+        """Define equality with other None objects."""
+        if other is None:
+            return True
+        elif other.__eq__(None):
+            return True
+        else:
+            return False
+
+    def __ne__(self, other):
+        """Define non-equality."""
+        if other is None:
+            return False
+        elif other.__eq__(None):
+            return False
+        else:
+            return True
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        try:
+            attributes = self.attributes
+        except AttributeError:
+            return "NoneElement"
+        return "NoneElement(attributes=%r)" % attributes
+
+
+class IntegerElement(int):
+    """NCBI Entrez XML element mapped to an integer."""
+
+    def __new__(cls, value, tag, attributes, key=None):
+        """Create an IntegerElement."""
+        self = int.__new__(cls, value)
+        self.tag = tag
+        if key is None:
+            self.key = tag
+        else:
+            self.key = key
+        self.attributes = attributes
+        return self
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        text = int.__repr__(self)
+        try:
+            attributes = self.attributes
+        except AttributeError:
+            return text
+        return "IntegerElement(%s, attributes=%r)" % (text, attributes)
+
+
+class StringElement(str):
+    """NCBI Entrez XML element mapped to a string."""
+
+    def __new__(cls, value, tag, attributes, key=None):
+        """Create a StringElement."""
+        self = str.__new__(cls, value)
+        self.tag = tag
+        if key is None:
+            self.key = tag
+        else:
+            self.key = key
+        self.attributes = attributes
+        return self
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        text = str.__repr__(self)
+        attributes = self.attributes
+        if not attributes:
+            return text
+        return "StringElement(%s, attributes=%r)" % (text, attributes)
+
+
+class ListElement(list):
+    """NCBI Entrez XML element mapped to a list."""
+
+    def __init__(self, tag, attributes, allowed_tags, key=None):
+        """Create a ListElement."""
+        self.tag = tag
+        if key is None:
+            self.key = tag
+        else:
+            self.key = key
+        self.attributes = attributes
+        self.allowed_tags = allowed_tags
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        text = list.__repr__(self)
+        attributes = self.attributes
+        if not attributes:
+            return text
+        return "ListElement(%s, attributes=%r)" % (text, attributes)
+
+    def store(self, value):
+        """Append an element to the list, checking tags."""
+        key = value.key
+        if self.allowed_tags is not None and key not in self.allowed_tags:
+            raise ValueError("Unexpected item '%s' in list" % key)
+        self.append(value)
+
+
+class DictionaryElement(dict):
+    """NCBI Entrez XML element mapped to a dictionary."""
+
+    def __init__(self, tag, attrs, allowed_tags, repeated_tags=None, key=None):
+        """Create a DictionaryElement."""
+        self.tag = tag
+        if key is None:
+            self.key = tag
+        else:
+            self.key = key
+        self.attributes = attrs
+        self.allowed_tags = allowed_tags
+        self.repeated_tags = repeated_tags
+        if repeated_tags:
+            for key in repeated_tags:
+                self[key] = []
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        text = dict.__repr__(self)
+        attributes = self.attributes
+        if not attributes:
+            return text
+        return "DictElement(%s, attributes=%r)" % (text, attributes)
+
+    def store(self, value):
+        """Add an entry to the dictionary, checking tags."""
+        key = value.key
+        tag = value.tag
+        if self.allowed_tags is not None and tag not in self.allowed_tags:
+            raise ValueError("Unexpected item '%s' in dictionary" % key)
+        if self.repeated_tags and key in self.repeated_tags:
+            self[key].append(value)
+        else:
+            self[key] = value
+
+
+class NotXMLError(ValueError):
+    """Failed to parse file as XML."""
+
+    def __init__(self, message):
+        """Initialize the class."""
+        self.msg = message
+
+    def __str__(self):
+        """Return a string summary of the exception."""
+        return (
+            "Failed to parse the XML data (%s). Please make sure that the input data "
+            "are in XML format." % self.msg
+        )
+
+
+class CorruptedXMLError(ValueError):
+    """Corrupted XML."""
+
+    def __init__(self, message):
+        """Initialize the class."""
+        self.msg = message
+
+    def __str__(self):
+        """Return a string summary of the exception."""
+        return (
+            "Failed to parse the XML data (%s). Please make sure that the input data "
+            "are not corrupted." % self.msg
+        )
+
+
+class ValidationError(ValueError):
+    """XML tag found which was not defined in the DTD.
+
+    Validating parsers raise this error if the parser finds a tag in the XML
+    that is not defined in the DTD. Non-validating parsers do not raise this
+    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
+    parsers by default (see those functions for more information).
+    """
+
+    def __init__(self, name):
+        """Initialize the class."""
+        self.name = name
+
+    def __str__(self):
+        """Return a string summary of the exception."""
+        return (
+            "Failed to find tag '%s' in the DTD. To skip all tags that "
+            "are not represented in the DTD, please call Bio.Entrez.read "
+            "or Bio.Entrez.parse with validate=False."
+            % self.name
+        )
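These derived element classes are what calling code actually receives back from `Bio.Entrez.read`. A short sketch of how the `.tag`/`.attributes` members and the `validate` flag surface in practice (hypothetical usage; network access and the vendored `lib.Bio` package assumed):

```python
# Sketch only: inspect the derived element types returned by Entrez.read.
from lib.Bio import Entrez  # vendored copy shipped in code/lib

Entrez.email = "you@example.org"  # placeholder address

handle = Entrez.efetch(db="pubmed", id="19304878", retmode="xml")
record = Entrez.read(handle, validate=False)  # skip tags missing from the DTD
handle.close()

article = record["PubmedArticle"][0]["MedlineCitation"]["Article"]
title = article["ArticleTitle"]  # a StringElement, i.e. a str subclass
print(isinstance(title, str))    # True
print(title.tag)                 # the XML tag name
print(title.attributes)          # any XML attributes, as a dict
```

With the default `validate=True`, a tag absent from the DTD raises the `ValidationError` defined above instead of being skipped.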
+
+
+class DataHandlerMeta(type):
+    """A metaclass is needed until Python supports @classproperty."""
+
+    def __init__(cls, *args, **kwargs):
+        """Initialize the class."""
+        cls._directory = None
+
+    @property
+    def directory(cls):
+        """Directory for caching XSD and DTD files."""
+        return cls._directory
+
+    @directory.setter
+    def directory(cls, value):
+        """Set a custom directory for the local DTD/XSD directories."""
+        if value is None:
+            import platform
+
+            if platform.system() == "Windows":
+                value = os.path.join(os.getenv("APPDATA"), "biopython")
+            else:  # Unix/Linux/Mac
+                home = os.path.expanduser("~")
+                value = os.path.join(home, ".config", "biopython")
+        cls._directory = value
+        # Create DTD local directory
+        cls.local_dtd_dir = os.path.join(cls._directory, "Bio", "Entrez", "DTDs")
+        os.makedirs(cls.local_dtd_dir, exist_ok=True)
+        # Create XSD local directory
+        cls.local_xsd_dir = os.path.join(cls._directory, "Bio", "Entrez", "XSDs")
+        os.makedirs(cls.local_xsd_dir, exist_ok=True)
+
+
+class DataHandler(metaclass=DataHandlerMeta):
+    """Data handler for parsing NCBI XML from Entrez."""
+
+    from lib.Bio import Entrez
+
+    global_dtd_dir = os.path.join(Entrez.__path__[0], "DTDs")
+    global_xsd_dir = os.path.join(Entrez.__path__[0], "XSDs")
+    local_dtd_dir = ""
+    local_xsd_dir = ""
+
+    del Entrez
+
+    def __init__(self, validate, escape):
+        """Create a DataHandler object."""
+        self.dtd_urls = []
+        self.element = None
+        self.level = 0
+        self.data = []
+        self.attributes = None
+        self.allowed_tags = None
+        self.strings = {}
+        self.lists = {}
+        self.dictionaries = {}
+        self.items = set()
+        self.errors = set()
+        self.validating = validate
+        self.parser = expat.ParserCreate(namespace_separator=" ")
+        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
+        self.parser.XmlDeclHandler = self.xmlDeclHandler
+        self.schema_namespace = None
+        self.namespace_level = Counter()
+        self.namespace_prefix = {}
+        if escape:
+            self.characterDataHandler = self.characterDataHandlerEscape
+        else:
+            self.characterDataHandler = self.characterDataHandlerRaw
+
+    def read(self, handle):
+        """Set up the parser and let it parse the XML results."""
+        # Expat's parser.ParseFile function only accepts binary data;
+        # see also the comment below for Entrez.parse.
+        if handle.read(0) != b"":
+            raise TypeError("file should be opened in binary mode")
+        try:
+            self.parser.ParseFile(handle)
+        except expat.ExpatError as e:
+            if self.parser.StartElementHandler:
+                # We saw the initial <?xml declaration, so we can be sure
+                # that we are parsing XML data. Most likely, the XML file
+                # is corrupted.
+                raise CorruptedXMLError(e) from None
+            else:
+                # We have not seen the initial <?xml declaration, so
+                # probably the input data is not in XML format.
+                raise NotXMLError(e) from None
+        try:
+            return self.record
+        except AttributeError:
+            if self.parser.StartElementHandler:
+                # We saw the initial <?xml declaration and expat did not
+                # report any errors, so self.record should be defined.
+                # If not, this is a bug.
+                raise RuntimeError(
+                    "Failed to parse the XML file correctly, possibly due to a "
+                    "bug in Bio.Entrez. Please contact the Biopython developers "
+                    "for assistance."
+                ) from None
+            else:
+                # We did not see an initial <?xml declaration, so probably
+                # the input data is not in XML format.
+                raise NotXMLError("XML declaration not found") from None
+
+    def parse(self, handle):
+        """Parse the XML in the given file handle, yielding records."""
+        BLOCK = 1024
+        while True:
+            # Read in another block of data from the file.
+            text = handle.read(BLOCK)
+            try:
+                self.parser.Parse(text, False)
+            except expat.ExpatError as e:
+                if self.parser.StartElementHandler:
+                    # We saw the initial <?xml declaration, so we can be
+                    # sure that we are parsing XML data. Most likely, the
+                    # XML file is corrupted.
+                    raise CorruptedXMLError(e) from None
+                else:
+                    # We have not seen the initial <?xml declaration, so
+                    # probably the input data is not in XML format.
+                    raise NotXMLError(e) from None
+            try:
+                records = self.record
+            except AttributeError:
+                if self.parser.StartElementHandler:
+                    # We saw the initial <?xml declaration and expat did
+                    # not report any errors, so self.record should be
+                    # defined. If not, this is a bug.
+                    raise RuntimeError(
+                        "Failed to parse the XML file correctly, possibly due "
+                        "to a bug in Bio.Entrez. Please contact the Biopython "
+                        "developers for assistance."
+                    ) from None
+                else:
+                    # We did not see an initial <?xml declaration, so
+                    # probably the input data is not in XML format.
+                    raise NotXMLError("XML declaration not found") from None
+            if not isinstance(records, list):
+                raise ValueError(
+                    "The XML file does not represent a list. Please use "
+                    "Entrez.read instead of Entrez.parse"
+                )
+            if not text:
+                break
+            while len(records) >= 2:
+                # Then the first record is finished, while the second record
+                # is still a work in progress.
+ record = records.pop(0) + yield record + + # We have reached the end of the XML file + self.parser = None + if self.element is not None: + # No more XML data, but there is still some unfinished business + raise CorruptedXMLError("Premature end of data") + + # Send out the remaining records + yield from records + + def xmlDeclHandler(self, version, encoding, standalone): + """Set XML handlers when an XML declaration is found.""" + self.parser.CharacterDataHandler = self.characterDataHandler + self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler + self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler + self.parser.EndNamespaceDeclHandler = self.endNamespaceDeclHandler + self.parser.StartElementHandler = self.handleMissingDocumentDefinition + + def handleMissingDocumentDefinition(self, tag, attrs): + """Raise an Exception if neither a DTD nor an XML Schema is found.""" + raise ValueError( + "As the XML data contained neither a Document Type Definition (DTD) nor an XML Schema, Bio.Entrez is unable to parse these data. We recommend using a generic XML parser from the Python standard library instead, for example ElementTree." + ) + + def startNamespaceDeclHandler(self, prefix, uri): + """Handle start of an XML namespace declaration.""" + if prefix == "xsi": + # This is an xml schema + self.schema_namespace = uri + self.parser.StartElementHandler = self.schemaHandler + else: + # Note that the DTD for MathML specifies a default attribute + # that declares the namespace for each MathML element. This means + # that MathML element in the XML has an invisible MathML namespace + # declaration that triggers a call to startNamespaceDeclHandler + # and endNamespaceDeclHandler. Therefore we need to count how often + # startNamespaceDeclHandler and endNamespaceDeclHandler were called + # to find out their first and last invocation for each namespace. 
+ if prefix == "mml": + assert uri == "http://www.w3.org/1998/Math/MathML" + elif prefix == "xlink": + assert uri == "http://www.w3.org/1999/xlink" + else: + raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri)) + self.namespace_level[prefix] += 1 + self.namespace_prefix[uri] = prefix + + def endNamespaceDeclHandler(self, prefix): + """Handle end of an XML namespace declaration.""" + if prefix != "xsi": + self.namespace_level[prefix] -= 1 + if self.namespace_level[prefix] == 0: + for key, value in self.namespace_prefix.items(): + if value == prefix: + break + else: + raise RuntimeError("Failed to find namespace prefix") + del self.namespace_prefix[key] + + def schemaHandler(self, name, attrs): + """Process the XML schema (before processing the element).""" + key = "%s noNamespaceSchemaLocation" % self.schema_namespace + schema = attrs[key] + handle = self.open_xsd_file(os.path.basename(schema)) + # if there is no local xsd file grab the url and parse the file + if not handle: + handle = urlopen(schema) + text = handle.read() + self.save_xsd_file(os.path.basename(schema), text) + handle.close() + self.parse_xsd(ET.fromstring(text)) + else: + self.parse_xsd(ET.fromstring(handle.read())) + handle.close() + # continue handling the element + self.startElementHandler(name, attrs) + # reset the element handler + self.parser.StartElementHandler = self.startElementHandler + + def startElementHandler(self, tag, attrs): + """Handle start of an XML element.""" + if tag in self.items: + assert tag == "Item" + name = attrs["Name"] + itemtype = attrs["Type"] + del attrs["Type"] + if itemtype == "Structure": + del attrs["Name"] + element = DictionaryElement( + name, attrs, allowed_tags=None, repeated_tags=None + ) + parent = self.element + element.parent = parent + # For consistency with lists below, store the element here + if parent is None: + self.record = element + else: + parent.store(element) + self.element = element + self.parser.EndElementHandler = self.endElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + elif name in ("ArticleIds", "History"): + del attrs["Name"] + allowed_tags = None # allowed tags are unknown + repeated_tags = frozenset(["pubmed", "medline"]) + element = DictionaryElement( + tag, + attrs, + allowed_tags=allowed_tags, + repeated_tags=repeated_tags, + key=name, + ) + parent = self.element + element.parent = parent + # For consistency with lists below, store the element here + if parent is None: + self.record = element + else: + parent.store(element) + self.element = element + self.parser.EndElementHandler = self.endElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + elif itemtype == "List": + del attrs["Name"] + allowed_tags = None # allowed tags are unknown + element = ListElement(tag, attrs, allowed_tags, name) + parent = self.element + element.parent = parent + if self.element is None: + # Set self.record here to let Entrez.parse iterate over it + self.record = element + else: + parent.store(element) + self.element = element + self.parser.EndElementHandler = self.endElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + elif itemtype == "Integer": + self.parser.EndElementHandler = self.endIntegerElementHandler + self.parser.CharacterDataHandler = self.characterDataHandler + self.attributes = attrs + elif itemtype in ("String", "Unknown", "Date", "Enumerator"): + assert self.attributes is None + self.attributes = attrs + self.parser.StartElementHandler = 
self.startRawElementHandler + self.parser.EndElementHandler = self.endStringElementHandler + self.parser.CharacterDataHandler = self.characterDataHandler + else: + raise ValueError("Unknown item type %s" % name) + elif tag in self.errors: + self.parser.EndElementHandler = self.endErrorElementHandler + self.parser.CharacterDataHandler = self.characterDataHandler + elif tag in self.strings: + self.parser.StartElementHandler = self.startRawElementHandler + self.parser.EndElementHandler = self.endStringElementHandler + self.parser.CharacterDataHandler = self.characterDataHandler + assert self.allowed_tags is None + self.allowed_tags = self.strings[tag] + assert self.attributes is None + self.attributes = attrs + elif tag in self.dictionaries: + allowed_tags, repeated_tags = self.dictionaries[tag] + element = DictionaryElement(tag, attrs, allowed_tags, repeated_tags) + parent = self.element + element.parent = parent + # For consistency with lists below, store the element here + if parent is None: + self.record = element + else: + parent.store(element) + self.element = element + self.parser.EndElementHandler = self.endElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + elif tag in self.lists: + allowed_tags = self.lists[tag] + element = ListElement(tag, attrs, allowed_tags) + parent = self.element + element.parent = parent + if parent is None: + # Set self.record here to let Entrez.parse iterate over it + self.record = element + else: + parent.store(element) + self.element = element + self.parser.EndElementHandler = self.endElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + else: + # Element not found in DTD + if self.validating: + raise ValidationError(tag) + else: + # this will not be stored in the record + self.parser.StartElementHandler = self.startSkipElementHandler + self.parser.EndElementHandler = self.endSkipElementHandler + self.parser.CharacterDataHandler = self.skipCharacterDataHandler + self.level = 1 + + def startRawElementHandler(self, name, attrs): + """Handle start of an XML raw element.""" + # check if the name is in a namespace + prefix = None + if self.namespace_prefix: + try: + uri, name = name.split() + except ValueError: + pass + else: + prefix = self.namespace_prefix[uri] + if self.namespace_level[prefix] == 1: + attrs = {"xmlns": uri} + if prefix: + key = "%s:%s" % (prefix, name) + else: + key = name + # self.allowed_tags is ignored for now. Anyway we know what to do + # with this tag. 
+        tag = "<%s" % name
+        for key, value in attrs.items():
+            tag += ' %s="%s"' % (key, value)
+        tag += ">"
+        self.data.append(tag)
+        self.parser.EndElementHandler = self.endRawElementHandler
+        self.level += 1
+
+    def startSkipElementHandler(self, name, attrs):
+        """Handle start of an XML skip element."""
+        self.level += 1
+
+    def endStringElementHandler(self, tag):
+        """Handle end of an XML string element."""
+        element = self.element
+        if element is not None:
+            self.parser.StartElementHandler = self.startElementHandler
+            self.parser.EndElementHandler = self.endElementHandler
+            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+        value = "".join(self.data)
+        self.data = []
+        attributes = self.attributes
+        self.attributes = None
+        if tag in self.items:
+            assert tag == "Item"
+            key = attributes["Name"]
+            del attributes["Name"]
+        else:
+            key = tag
+        value = StringElement(value, tag, attributes, key)
+        if element is None:
+            self.record = element
+        else:
+            element.store(value)
+        self.allowed_tags = None
+
+    def endRawElementHandler(self, name):
+        """Handle end of an XML raw element."""
+        self.level -= 1
+        if self.level == 0:
+            self.parser.EndElementHandler = self.endStringElementHandler
+        if self.namespace_prefix:
+            try:
+                uri, name = name.split()
+            except ValueError:
+                pass
+        tag = "</%s>" % name
+        self.data.append(tag)
+
+    def endSkipElementHandler(self, name):
+        """Handle end of an XML skip element."""
+        self.level -= 1
+        if self.level == 0:
+            self.parser.StartElementHandler = self.startElementHandler
+            self.parser.EndElementHandler = self.endElementHandler
+
+    def endErrorElementHandler(self, name):
+        """Handle end of an XML error element."""
+        if self.data:
+            # error found:
+            value = "".join(self.data)
+            raise RuntimeError(value)
+        # no error found:
+        if self.element is not None:
+            self.parser.EndElementHandler = self.endElementHandler
+            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+
+    def endElementHandler(self, name):
+        """Handle end of an XML element."""
+        element = self.element
+        self.element = element.parent
+        del element.parent
+
+    def endIntegerElementHandler(self, tag):
+        """Handle end of an XML integer element."""
+        attributes = self.attributes
+        self.attributes = None
+        assert tag == "Item"
+        key = attributes["Name"]
+        del attributes["Name"]
+        if self.data:
+            value = int("".join(self.data))
+            self.data = []
+            value = IntegerElement(value, tag, attributes, key)
+        else:
+            value = NoneElement(tag, attributes, key)
+        element = self.element
+        if element is None:
+            self.record = value
+        else:
+            self.parser.EndElementHandler = self.endElementHandler
+            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+            if value is None:
+                return
+            element.store(value)
+
+    def characterDataHandlerRaw(self, content):
+        """Handle character data as-is (raw)."""
+        self.data.append(content)
+
+    def characterDataHandlerEscape(self, content):
+        """Handle character data by encoding it."""
+        content = escape(content)
+        self.data.append(content)
+
+    def skipCharacterDataHandler(self, content):
+        """Handle character data by skipping it."""
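All of the handlers above cooperate by repointing expat's callbacks at one another as the parse proceeds, which is how the parser switches between collecting, escaping, and skipping character data. A stripped-down, self-contained illustration of that handler-swapping pattern (toy XML and toy tag names, none of the Entrez DTD machinery):

```python
# Toy illustration of the expat handler-swapping pattern used above.
from xml.parsers import expat

collected = []

def start(tag, attrs):
    # Turn character-data collection on only inside <Title> elements.
    if tag == "Title":
        parser.CharacterDataHandler = collected.append

def end(tag):
    # Turn it back off when the element closes.
    if tag == "Title":
        parser.CharacterDataHandler = None

parser = expat.ParserCreate()
parser.StartElementHandler = start
parser.EndElementHandler = end
parser.Parse("<Rec><Title>Hello</Title><Skip>ignored</Skip></Rec>", True)
print("".join(collected))  # Hello
```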
+
+    def parse_xsd(self, root):
+        """Parse an XSD file."""
+        prefix = "{http://www.w3.org/2001/XMLSchema}"
+        for element in root:
+            isSimpleContent = False
+            attribute_keys = []
+            keys = []
+            multiple = []
+            assert element.tag == prefix + "element"
+            name = element.attrib["name"]
+            assert len(element) == 1
+            complexType = element[0]
+            assert complexType.tag == prefix + "complexType"
+            for component in complexType:
+                tag = component.tag
+                if tag == prefix + "attribute":
+                    # we could distinguish by type; keeping string for now
+                    attribute_keys.append(component.attrib["name"])
+                elif tag == prefix + "sequence":
+                    maxOccurs = component.attrib.get("maxOccurs", "1")
+                    for key in component:
+                        assert key.tag == prefix + "element"
+                        ref = key.attrib["ref"]
+                        keys.append(ref)
+                        if maxOccurs != "1" or key.attrib.get("maxOccurs", "1") != "1":
+                            multiple.append(ref)
+                elif tag == prefix + "simpleContent":
+                    assert len(component) == 1
+                    extension = component[0]
+                    assert extension.tag == prefix + "extension"
+                    assert extension.attrib["base"] == "xs:string"
+                    for attribute in extension:
+                        assert attribute.tag == prefix + "attribute"
+                        # we could distinguish by type; keeping string for now
+                        attribute_keys.append(attribute.attrib["name"])
+                    isSimpleContent = True
+            allowed_tags = frozenset(keys)
+            if len(keys) == 1 and keys == multiple:
+                assert not isSimpleContent
+                self.lists[name] = allowed_tags
+            elif len(keys) >= 1:
+                assert not isSimpleContent
+                repeated_tags = frozenset(multiple)
+                self.dictionaries[name] = (allowed_tags, repeated_tags)
+            else:
+                self.strings[name] = allowed_tags
+
+    def elementDecl(self, name, model):
+        """Call a call-back function for each element declaration in a DTD.
+
+        This is used for each element declaration in a DTD like::
+
+            <!ELEMENT       name          (...)>
+
+        The purpose of this function is to determine whether this element
+        should be regarded as a string, integer, list, dictionary, structure,
+        or error.
+        """
+        if name.upper() == "ERROR":
+            self.errors.add(name)
+            return
+        if name == "Item" and model == (
+            expat.model.XML_CTYPE_MIXED,
+            expat.model.XML_CQUANT_REP,
+            None,
+            ((expat.model.XML_CTYPE_NAME, expat.model.XML_CQUANT_NONE, "Item", ()),),
+        ):
+            # Special case. As far as I can tell, this only occurs in the
+            # eSummary DTD.
+            self.items.add(name)
+            return
+        # First, remove ignorable parentheses around declarations
+        while (
+            model[0] in (expat.model.XML_CTYPE_SEQ, expat.model.XML_CTYPE_CHOICE)
+            and model[1] in (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
+            and len(model[3]) == 1
+        ):
+            model = model[3][0]
+        # PCDATA declarations correspond to strings
+        if model[0] in (expat.model.XML_CTYPE_MIXED, expat.model.XML_CTYPE_EMPTY):
+            if model[1] == expat.model.XML_CQUANT_REP:
+                children = model[3]
+                allowed_tags = frozenset(child[2] for child in children)
+            else:
+                allowed_tags = frozenset()
+            self.strings[name] = allowed_tags
+            return
+        # List-type elements
+        if model[0] in (
+            expat.model.XML_CTYPE_CHOICE,
+            expat.model.XML_CTYPE_SEQ,
+        ) and model[1] in (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP):
+            children = model[3]
+            if model[0] == expat.model.XML_CTYPE_SEQ:
+                assert len(children) == 1
+            allowed_tags = frozenset(child[2] for child in children)
+            self.lists[name] = allowed_tags
+            return
+        # This is the tricky case. Check which keys can occur multiple
+        # times. If only one key is possible, and it can occur multiple
+        # times, then this is a list. If more than one key is possible,
+        # but none of them can occur multiple times, then this is a
+        # dictionary. Otherwise, this is a structure.
+        # In 'single' and 'multiple', we keep track which keys can occur
+        # only once, and which can occur multiple times.
+        single = []
+        multiple = []
+        # The 'count' function is called recursively to make sure all the
+        # children in this model are counted. Error keys are ignored;
+        # they raise an exception in Python.
+ + def count(model): + quantifier, key, children = model[1:] + if key is None: + if quantifier in ( + expat.model.XML_CQUANT_PLUS, + expat.model.XML_CQUANT_REP, + ): + for child in children: + multiple.append(child[2]) + else: + for child in children: + count(child) + elif key.upper() != "ERROR": + if quantifier in ( + expat.model.XML_CQUANT_NONE, + expat.model.XML_CQUANT_OPT, + ): + single.append(key) + elif quantifier in ( + expat.model.XML_CQUANT_PLUS, + expat.model.XML_CQUANT_REP, + ): + multiple.append(key) + + count(model) + if len(single) == 0 and len(multiple) == 1: + allowed_tags = frozenset(multiple) + self.lists[name] = allowed_tags + else: + allowed_tags = frozenset(single + multiple) + repeated_tags = frozenset(multiple) + self.dictionaries[name] = (allowed_tags, repeated_tags) + + def open_dtd_file(self, filename): + """Open specified DTD file.""" + path = os.path.join(DataHandler.local_dtd_dir, filename) + try: + handle = open(path, "rb") + except FileNotFoundError: + pass + else: + return handle + path = os.path.join(DataHandler.global_dtd_dir, filename) + try: + handle = open(path, "rb") + except FileNotFoundError: + pass + else: + return handle + return None + + def open_xsd_file(self, filename): + """Open specified XSD file.""" + path = os.path.join(DataHandler.local_xsd_dir, filename) + try: + handle = open(path, "rb") + except FileNotFoundError: + pass + else: + return handle + path = os.path.join(DataHandler.global_xsd_dir, filename) + try: + handle = open(path, "rb") + except FileNotFoundError: + pass + else: + return handle + return None + + def save_dtd_file(self, filename, text): + """Save DTD file to cache.""" + path = os.path.join(DataHandler.local_dtd_dir, filename) + try: + handle = open(path, "wb") + except OSError: + warnings.warn("Failed to save %s at %s" % (filename, path)) + else: + handle.write(text) + handle.close() + + def save_xsd_file(self, filename, text): + """Save XSD file to cache.""" + path = os.path.join(DataHandler.local_xsd_dir, filename) + try: + handle = open(path, "wb") + except OSError: + warnings.warn("Failed to save %s at %s" % (filename, path)) + else: + handle.write(text) + handle.close() + + def externalEntityRefHandler(self, context, base, systemId, publicId): + """Handle external entity reference in order to cache DTD locally. + + The purpose of this function is to load the DTD locally, instead + of downloading it from the URL specified in the XML. Using the local + DTD results in much faster parsing. If the DTD is not found locally, + we try to download it. If new DTDs become available from NCBI, + putting them in Bio/Entrez/DTDs will allow the parser to see them. + """ + urlinfo = urlparse(systemId) + if urlinfo.scheme in ["http", "https", "ftp"]: + # Then this is an absolute path to the DTD. + url = systemId + elif urlinfo.scheme == "": + # Then this is a relative path to the DTD. + # Look at the parent URL to find the full path. 
+            try:
+                source = self.dtd_urls[-1]
+            except IndexError:
+                # Assume the default URL for DTDs if the top parent
+                # does not contain an absolute path
+                source = "http://www.ncbi.nlm.nih.gov/dtd/"
+            else:
+                source = os.path.dirname(source)
+            # urls always have a forward slash, don't use os.path.join
+            url = source.rstrip("/") + "/" + systemId
+        else:
+            raise ValueError("Unexpected URL scheme %r" % urlinfo.scheme)
+        self.dtd_urls.append(url)
+        # First, try to load the local version of the DTD file
+        location, filename = os.path.split(systemId)
+        handle = self.open_dtd_file(filename)
+        if not handle:
+            # DTD is not available as a local file. Try accessing it through
+            # the internet instead.
+            try:
+                handle = urlopen(url)
+            except OSError:
+                raise RuntimeError(
+                    "Failed to access %s at %s" % (filename, url)
+                ) from None
+            text = handle.read()
+            handle.close()
+            self.save_dtd_file(filename, text)
+            handle = BytesIO(text)
+
+        parser = self.parser.ExternalEntityParserCreate(context)
+        parser.ElementDeclHandler = self.elementDecl
+        parser.ParseFile(handle)
+        handle.close()
+        self.dtd_urls.pop()
+        self.parser.StartElementHandler = self.startElementHandler
+        return 1
diff --git a/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd b/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd
new file mode 100644
index 0000000..6194a26
--- /dev/null
+++ b/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd
@@ -0,0 +1,97 @@
+[... 97 lines of XSD declarations; markup lost in extraction ...]
diff --git a/code/lib/Bio/Entrez/__init__.py b/code/lib/Bio/Entrez/__init__.py
new file mode 100644
index 0000000..03f2805
--- /dev/null
+++ b/code/lib/Bio/Entrez/__init__.py
@@ -0,0 +1,696 @@
+# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
+# Copyright 2008-2013 by Michiel de Hoon. All rights reserved.
+# Revisions copyright 2011-2016 by Peter Cock. All rights reserved.
+# Revisions copyright 2015 by Eric Rasche. All rights reserved.
+# Revisions copyright 2015 by Carlos Pena. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Provides code to access NCBI over the WWW.
+
+The main Entrez web page is available at:
+http://www.ncbi.nlm.nih.gov/Entrez/
+
+Entrez Programming Utilities web page is available at:
+http://www.ncbi.nlm.nih.gov/books/NBK25501/
+
+This module provides a number of functions like ``efetch`` (short for
+Entrez Fetch) which will return the data as a handle object. This is
+a standard interface used in Python for reading data from a file, or
+in this case a remote network connection, and provides methods like
+``.read()`` or offers iteration over the contents line by line. See
+also "What the heck is a handle?" in the Biopython Tutorial and
+Cookbook:
+http://biopython.org/DIST/docs/tutorial/Tutorial.html
+http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+
+The handle returned by these functions can be either in text mode or
+in binary mode, depending on the data requested and the results
+returned by NCBI Entrez. Typically, XML data will be in binary mode
+while other data will be in text mode, as required by the downstream
+parser to parse the data.
+
+Unlike a handle to a file on disk from the ``open(filename)`` function,
+which has a ``.name`` attribute giving the filename, the handles from
+``Bio.Entrez`` all have a ``.url`` attribute instead giving the URL
+used to connect to the NCBI Entrez API.
+
+All the functions that send requests to the NCBI Entrez API will
+automatically respect the NCBI rate limit (of 3 requests per second
+without an API key, or 10 requests per second with an API key) and
+will automatically retry when encountering transient failures
+(i.e. connection failures or HTTP 5XX codes). By default, Biopython
+does a maximum of three tries before giving up, and sleeps for 15
+seconds between tries. You can tweak these parameters by setting
+``Bio.Entrez.max_tries`` and ``Bio.Entrez.sleep_between_tries``.
+
+The Entrez module also provides an XML parser which takes a handle
+as input.
+
+Variables:
+
+    - email         Set the Entrez email parameter (default is not set).
+    - tool          Set the Entrez tool parameter (default is ``biopython``).
+    - api_key       Personal API key from NCBI. If not set, only 3 queries per
+                    second are allowed; 10 queries per second are allowed
+                    otherwise, with a valid API key.
+    - max_tries     Configures how many times failed requests will be
+                    automatically retried on error (default is 3).
+    - sleep_between_tries   The delay, in seconds, before retrying a request
+                    on error (default is 15).
+
+Functions:
+
+    - efetch        Retrieves records in the requested format from a list of
+                    one or more primary IDs or from the user's environment.
+    - epost         Posts a file containing a list of primary IDs for future
+                    use in the user's environment to use with subsequent
+                    search strategies.
+    - esearch       Searches and retrieves primary IDs (for use in EFetch,
+                    ELink, and ESummary) and term translations and optionally
+                    retains results for future use in the user's environment.
+    - elink         Checks for the existence of an external or Related
+                    Articles link from a list of one or more primary IDs.
+                    Retrieves primary IDs and relevancy scores for links to
+                    Entrez databases or Related Articles; creates a hyperlink
+                    to the primary LinkOut provider for a specific ID and
+                    database, or lists LinkOut URLs and Attributes for
+                    multiple IDs.
+    - einfo         Provides field index term counts, last update, and
+                    available links for each database.
+    - esummary      Retrieves document summaries from a list of primary IDs
+                    or from the user's environment.
+    - egquery       Provides Entrez database counts in XML for a single
+                    search using Global Query.
+    - espell        Retrieves spelling suggestions.
+    - ecitmatch     Retrieves PubMed IDs (PMIDs) that correspond to a set of
+                    input citation strings.
+
+    - read          Parses the XML results returned by any of the above
+                    functions. Alternatively, the XML data can be read from a
+                    file opened in binary mode. Typical usage is:
+
+                        >>> from Bio import Entrez
+                        >>> Entrez.email = "Your.Name.Here@example.org"
+                        >>> handle = Entrez.einfo()  # or esearch, efetch, ...
+                        >>> record = Entrez.read(handle)
+                        >>> handle.close()
+
+                    where record is now a Python dictionary or list.
+
+    - parse         Parses the XML results returned by those of the above
+                    functions which can return multiple records - such as
+                    efetch, esummary and elink. Typical usage is:
+
+                        >>> handle = Entrez.esummary(db="pubmed", id="19304878,14630660", retmode="xml")
+                        >>> records = Entrez.parse(handle)
+                        >>> for record in records:
+                        ...     # each record is a Python dictionary or list.
+                        ...     print(record['Title'])
+                        Biopython: freely available Python tools for computational molecular biology and bioinformatics.
+                        PDB file parser and structure class implemented in Python.
+                        >>> handle.close()
+
+                    This function is appropriate only if the XML file contains
+                    multiple records, and is particularly useful for large files.
+
+    - _open         Internally used function.
+
+"""
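In this capsule the typical pattern is an `esearch` followed by an `efetch` on the returned IDs. A minimal sketch, with a hypothetical query term, a placeholder email, and network access assumed:

```python
# Sketch only: search PubMed, then fetch the matching abstracts as text.
from lib.Bio import Entrez  # vendored copy shipped in code/lib

Entrez.email = "you@example.org"  # placeholder address

search = Entrez.read(Entrez.esearch(db="pubmed", term="zinc deficiency", retmax=5))
ids = ",".join(search["IdList"])

handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
print(handle.read())
handle.close()
```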
+
+import time
+import warnings
+import io
+from urllib.error import URLError, HTTPError
+from urllib.parse import urlencode
+from urllib.request import urlopen
+
+
+email = None
+max_tries = 3
+sleep_between_tries = 15
+tool = "biopython"
+api_key = None
+
+
+# XXX retmode?
+def epost(db, **keywds):
+    """Post a file of identifiers for future use.
+
+    Posts a file containing a list of UIs for future use in the user's
+    environment to use with subsequent search strategies.
+
+    See the online documentation for an explanation of the parameters:
+    http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EPost
+
+    Return a handle to the results.
+
+    Raises an IOError exception if there's a network error.
+    """
+    cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi"
+    variables = {"db": db}
+    variables.update(keywds)
+    return _open(cgi, variables, post=True)
+
+
+def efetch(db, **keywords):
+    """Fetch Entrez results which are returned as a handle.
+
+    EFetch retrieves records in the requested format from a list or set of
+    one or more UIs or from the user's environment.
+
+    See the online documentation for an explanation of the parameters:
+    http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+
+    Return a handle to the results.
+
+    Raises an IOError exception if there's a network error.
+
+    Short example:
+
+    >>> from Bio import Entrez
+    >>> Entrez.email = "Your.Name.Here@example.org"
+    >>> handle = Entrez.efetch(db="nucleotide", id="AY851612", rettype="gb", retmode="text")
+    >>> print(handle.readline().strip())
+    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
+    >>> handle.close()
+
+    This will automatically use an HTTP POST rather than HTTP GET if there
+    are over 200 identifiers as recommended by the NCBI.
+
+    **Warning:** The NCBI changed the default retmode in Feb 2012, so many
+    databases which previously returned text output now give XML.
+    """
+    cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    variables = {"db": db}
+    variables.update(keywords)
+    post = False
+    try:
+        ids = variables["id"]
+    except KeyError:
+        pass
+    else:
+        try:
+            # ids is a single integer or a string representing a single integer
+            ids = str(int(ids))
+        except TypeError:
+            # ids was not a string; try an iterable:
+            ids = ",".join(map(str, ids))
+        except ValueError:
+            # string with commas or string not representing an integer
+            ids = ",".join(map(str, (id.strip() for id in ids.split(","))))
+
+        variables["id"] = ids
+        if ids.count(",") >= 200:
+            # NCBI prefers an HTTP POST instead of an HTTP GET if there are
+            # more than about 200 IDs
+            post = True
+    return _open(cgi, variables, post=post)
+
+
+def esearch(db, term, **keywds):
+    """Run an Entrez search and return a handle to the results.
+
+    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
+    and ESummary) and term translations, and optionally retains results
+    for future use in the user's environment.
+
+    See the online documentation for an explanation of the parameters:
+    http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+
+    Return a handle to the results which are always in XML format.
+ + Raises an IOError exception if there's a network error. + + Short example: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD", idtype="acc") + >>> record = Entrez.read(handle) + >>> handle.close() + >>> int(record["Count"]) >= 2 + True + >>> "EF590893.1" in record["IdList"] + True + >>> "EF590892.1" in record["IdList"] + True + + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + variables = {"db": db, "term": term} + variables.update(keywds) + return _open(cgi, variables) + + +def elink(**keywds): + """Check for linked external articles and return a handle. + + ELink checks for the existence of an external or Related Articles link + from a list of one or more primary IDs; retrieves IDs and relevancy + scores for links to Entrez databases or Related Articles; creates a + hyperlink to the primary LinkOut provider for a specific ID and + database, or lists LinkOut URLs and attributes for multiple IDs. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink + + Return a handle to the results, by default in XML format. + + Raises an IOError exception if there's a network error. + + This example finds articles related to the Biopython application + note's entry in the PubMed database: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> pmid = "19304878" + >>> handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed") + >>> record = Entrez.read(handle) + >>> handle.close() + >>> print(record[0]["LinkSetDb"][0]["LinkName"]) + pubmed_pubmed + >>> linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]] + >>> "17121776" in linked + True + + This is explained in much more detail in the Biopython Tutorial. + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" + variables = {} + variables.update(keywds) + return _open(cgi, variables) + + +def einfo(**keywds): + """Return a summary of the Entrez databases as a results handle. + + EInfo provides field names, index term counts, last update, and + available links for each Entrez database. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo + + Return a handle to the results, by default in XML format. + + Raises an IOError exception if there's a network error. + + Short example: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> record = Entrez.read(Entrez.einfo()) + >>> 'pubmed' in record['DbList'] + True + + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi" + variables = {} + variables.update(keywds) + return _open(cgi, variables) + + +def esummary(**keywds): + """Retrieve document summaries as a results handle. + + ESummary retrieves document summaries from a list of primary IDs or + from the user's environment. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary + + Return a handle to the results, by default in XML format. + + Raises an IOError exception if there's a network error. 
+ + This example discovers more about entry 19923 in the structure + database: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> handle = Entrez.esummary(db="structure", id="19923") + >>> record = Entrez.read(handle) + >>> handle.close() + >>> print(record[0]["Id"]) + 19923 + >>> print(record[0]["PdbDescr"]) + Crystal Structure Of E. Coli Aconitase B + + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + variables = {} + variables.update(keywds) + return _open(cgi, variables) + + +def egquery(**keywds): + """Provide Entrez database counts for a global search. + + EGQuery provides Entrez database counts in XML for a single search + using Global Query. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EGQuery + + Return a handle to the results in XML format. + + Raises an IOError exception if there's a network error. + + This quick example based on a longer version from the Biopython + Tutorial just checks there are over 60 matches for 'Biopython' + in PubMedCentral: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> handle = Entrez.egquery(term="biopython") + >>> record = Entrez.read(handle) + >>> handle.close() + >>> for row in record["eGQueryResult"]: + ... if "pmc" in row["DbName"]: + ... print(int(row["Count"]) > 60) + True + + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi" + variables = {} + variables.update(keywds) + return _open(cgi, variables) + + +def espell(**keywds): + """Retrieve spelling suggestions as a results handle. + + ESpell retrieves spelling suggestions, if available. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESpell + + Return a handle to the results, by default in XML format. + + Raises an IOError exception if there's a network error. + + Short example: + + >>> from Bio import Entrez + >>> Entrez.email = "Your.Name.Here@example.org" + >>> record = Entrez.read(Entrez.espell(term="biopythooon")) + >>> print(record["Query"]) + biopythooon + >>> print(record["CorrectedQuery"]) + biopython + + """ + cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi" + variables = {} + variables.update(keywds) + return _open(cgi, variables) + + +def _update_ecitmatch_variables(keywds): + # XML is the only supported value, and it actually returns TXT. + variables = {"retmode": "xml"} + citation_keys = ( + "journal_title", + "year", + "volume", + "first_page", + "author_name", + "key", + ) + + # Accept pre-formatted strings + if isinstance(keywds["bdata"], str): + variables.update(keywds) + else: + # Alternatively accept a nicer interface + variables["db"] = keywds["db"] + bdata = [] + for citation in keywds["bdata"]: + formatted_citation = "|".join( + [citation.get(key, "") for key in citation_keys] + ) + bdata.append(formatted_citation) + variables["bdata"] = "\r".join(bdata) + return variables + + +def ecitmatch(**keywds): + """Retrieve PMIDs for input citation strings, returned as a handle. + + ECitMatch retrieves PubMed IDs (PMIDs) that correspond to a set of input + citation strings. + + See the online documentation for an explanation of the parameters: + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ECitMatch + + Return a handle to the results, by default in plain text + + Raises an IOError exception if there's a network error. 
+
+    Short example:
+
+    >>> from Bio import Entrez
+    >>> Entrez.email = "Your.Name.Here@example.org"
+    >>> citation_1 = {"journal_title": "proc natl acad sci u s a",
+    ...               "year": "1991", "volume": "88", "first_page": "3248",
+    ...               "author_name": "mann bj", "key": "citation_1"}
+    >>> handle = Entrez.ecitmatch(db="pubmed", bdata=[citation_1])
+    >>> print(handle.read().strip().split("|"))
+    ['proc natl acad sci u s a', '1991', '88', '3248', 'mann bj', 'citation_1', '2014248']
+    >>> handle.close()
+
+    """
+    cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi"
+    variables = _update_ecitmatch_variables(keywds)
+    return _open(cgi, variables, ecitmatch=True)
+
+
+def read(handle, validate=True, escape=False):
+    """Parse an XML file from the NCBI Entrez Utilities into python objects.
+
+    This function parses an XML file created by NCBI's Entrez Utilities,
+    returning a multilevel data structure of Python lists and dictionaries.
+    Most XML files returned by NCBI's Entrez Utilities can be parsed by
+    this function, provided its DTD is available. Biopython includes the
+    DTDs for most commonly used Entrez Utilities.
+
+    The handle must be in binary mode. This allows the parser to detect the
+    encoding from the XML file, and to use it to convert all text in the XML
+    to the correct Unicode string. The functions in Bio.Entrez to access NCBI
+    Entrez will automatically return XML data in binary mode. For files,
+    please use mode "rb" when opening the file, as in
+
+    >>> from Bio import Entrez
+    >>> handle = open("Entrez/esearch1.xml", "rb")  # opened in binary mode
+    >>> record = Entrez.read(handle)
+    >>> print(record['QueryTranslation'])
+    biopython[All Fields]
+    >>> handle.close()
+
+    If validate is True (default), the parser will validate the XML file
+    against the DTD, and raise an error if the XML file contains tags that
+    are not represented in the DTD. If validate is False, the parser will
+    simply skip such tags.
+
+    If escape is True, all characters that are not valid HTML are replaced
+    by HTML escape characters to guarantee that the returned strings are
+    valid HTML fragments. For example, a less-than sign (<) is replaced by
+    &lt;. If escape is False (default), the string is returned as is.
+
+    Whereas the data structure seems to consist of generic Python lists,
+    dictionaries, strings, and so on, each of these is actually a class
+    derived from the base type. This allows us to store the attributes
+    (if any) of each element in a dictionary my_element.attributes, and
+    the tag name in my_element.tag.
+    """
+    from .Parser import DataHandler
+
+    handler = DataHandler(validate, escape)
+    record = handler.read(handle)
+    return record
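Where `read` above loads the complete record set into memory, `parse` below yields records one at a time, so large downloads can be processed in roughly constant memory. A sketch of the streaming form on a previously saved file (hypothetical file name):

```python
# Sketch only: stream a large saved PubMed XML file record by record.
from lib.Bio import Entrez  # vendored copy shipped in code/lib

with open("pubmed_result.xml", "rb") as handle:  # hypothetical earlier download
    for record in Entrez.parse(handle):
        citation = record["MedlineCitation"]
        print(citation["PMID"], citation["Article"]["ArticleTitle"])
```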
+
+
+def parse(handle, validate=True, escape=False):
+    """Parse an XML file from the NCBI Entrez Utilities into python objects.
+
+    This function parses an XML file created by NCBI's Entrez Utilities,
+    returning a multilevel data structure of Python lists and dictionaries.
+    This function is suitable for XML files that (in Python) can be
+    represented as a list of individual records. Whereas 'read' reads the
+    complete file and returns a single Python list, 'parse' is a generator
+    function that returns the records one by one. This function is therefore
+    particularly useful for parsing large files.
+
+    Most XML files returned by NCBI's Entrez Utilities can be parsed by
+    this function, provided its DTD is available. Biopython includes the
+    DTDs for most commonly used Entrez Utilities.
+
+    The handle must be in binary mode. This allows the parser to detect the
+    encoding from the XML file, and to use it to convert all text in the XML
+    to the correct Unicode string. The functions in Bio.Entrez to access NCBI
+    Entrez will automatically return XML data in binary mode. For files,
+    please use mode "rb" when opening the file, as in
+
+    >>> from Bio import Entrez
+    >>> handle = open("Entrez/pubmed1.xml", "rb")  # opened in binary mode
+    >>> records = Entrez.parse(handle)
+    >>> for record in records:
+    ...     print(record['MedlineCitation']['Article']['Journal']['Title'])
+    ...
+    Social justice (San Francisco, Calif.)
+    Biochimica et biophysica acta
+    >>> handle.close()
+
+    If validate is True (default), the parser will validate the XML file
+    against the DTD, and raise an error if the XML file contains tags that
+    are not represented in the DTD. If validate is False, the parser will
+    simply skip such tags.
+
+    If escape is True, all characters that are not valid HTML are replaced
+    by HTML escape characters to guarantee that the returned strings are
+    valid HTML fragments. For example, a less-than sign (<) is replaced by
+    &lt;. If escape is False (default), the string is returned as is.
+
+    Whereas the data structure seems to consist of generic Python lists,
+    dictionaries, strings, and so on, each of these is actually a class
+    derived from the base type. This allows us to store the attributes
+    (if any) of each element in a dictionary my_element.attributes, and
+    the tag name in my_element.tag.
+    """
+    from .Parser import DataHandler
+
+    handler = DataHandler(validate, escape)
+    records = handler.parse(handle)
+    return records
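The retry and rate-limit behaviour implemented by `_open` below is controlled entirely through the module-level variables documented at the top of this file; a sketch of tuning them before making any requests (the API key shown is a placeholder):

```python
# Sketch only: tune the client-side rate limiting and retry policy.
from lib.Bio import Entrez  # vendored copy shipped in code/lib

Entrez.email = "you@example.org"    # required by NCBI for all requests
Entrez.api_key = "MY_NCBI_API_KEY"  # placeholder; raises the limit to 10 req/s
Entrez.max_tries = 5                # retry transient failures five times
Entrez.sleep_between_tries = 10     # seconds to wait between retries
```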
+
+
+def _open(cgi, params=None, post=None, ecitmatch=False):
+    """Build the URL and open a handle to it (PRIVATE).
+
+    Open a handle to Entrez. cgi is the URL for the cgi script to access.
+    params is a dictionary with the options to pass to it. Does some
+    simple error checking, and will raise an IOError if it encounters one.
+
+    The argument post should be a boolean to explicitly control if an HTTP
+    POST should be used rather than an HTTP GET based on the query length.
+    By default (post=None), POST is used if the URL encoded parameters would
+    be over 1000 characters long.
+
+    This function also enforces the "up to three queries per second rule"
+    to avoid abusing the NCBI servers.
+    """
+    # NCBI requirement: At most three queries per second if no API key is
+    # provided. Equivalently, at least a third of a second between queries.
+    params = _construct_params(params)
+    options = _encode_options(ecitmatch, params)
+    # Using just 0.333333334 seconds sometimes hit the NCBI rate limit,
+    # the slightly longer pause of 0.37 seconds has been more reliable.
+    delay = 0.1 if api_key else 0.37
+    current = time.time()
+    wait = _open.previous + delay - current
+    if wait > 0:
+        time.sleep(wait)
+        _open.previous = current + wait
+    else:
+        _open.previous = current
+
+    # By default, post is None. Set to a boolean to over-ride length choice:
+    if post is None and len(options) > 1000:
+        post = True
+    cgi = _construct_cgi(cgi, post, options)
+
+    for i in range(max_tries):
+        try:
+            if post:
+                handle = urlopen(cgi, data=options.encode("utf8"))
+            else:
+                handle = urlopen(cgi)
+        except HTTPError as exception:
+            # Reraise if the final try fails
+            if i >= max_tries - 1:
+                raise
+            # Reraise if the exception is triggered by a HTTP 4XX error
+            # indicating some kind of bad request, UNLESS it's specifically a
+            # 429 "Too Many Requests" response. NCBI seems to sometimes
+            # erroneously return 429s even when their rate limit is
+            # honored (and indeed even with the rate-limit-related fudging
+            # higher up in this function in place), so the best we can do is
+            # treat them as a serverside error and try again after sleeping
+            # for a bit.
+            if exception.code // 100 == 4 and exception.code != 429:
+                raise
+        except URLError:
+            # Reraise if the final try fails
+            if i >= max_tries - 1:
+                raise
+            # Treat as a transient error and try again after a brief delay:
+            time.sleep(sleep_between_tries)
+        else:
+            break
+
+    subtype = handle.headers.get_content_subtype()
+    if subtype == "plain":
+        url = handle.url
+        handle = io.TextIOWrapper(handle, encoding="UTF-8")
+        handle.url = url
+    return handle
+
+
+_open.previous = 0
+
+
+def _construct_params(params):
+    if params is None:
+        params = {}
+
+    # Remove None values from the parameters
+    for key, value in list(params.items()):
+        if value is None:
+            del params[key]
+    # Tell Entrez that we are using Biopython (or whatever the user has
+    # specified explicitly in the parameters or by changing the default)
+    if "tool" not in params:
+        params["tool"] = tool
+    # Tell Entrez who we are
+    if "email" not in params:
+        if email is not None:
+            params["email"] = email
+        else:
+            warnings.warn(
+                """
+Email address is not specified.
+
+To make use of NCBI's E-utilities, NCBI requires you to specify your
+email address with each request. As an example, if your email address
+is A.N.Other@example.com, you can specify it as follows:
+   from Bio import Entrez
+   Entrez.email = 'A.N.Other@example.com'
+In case of excessive usage of the E-utilities, NCBI will attempt to contact
+a user at the email address provided before blocking access to the
+E-utilities.""",
+                UserWarning,
+            )
+    if api_key and "api_key" not in params:
+        params["api_key"] = api_key
+    return params
+
+
+def _encode_options(ecitmatch, params):
+    # Open a handle to Entrez.
+    options = urlencode(params, doseq=True)
+    # urlencode encodes pipes, which NCBI expects in ECitMatch
+    if ecitmatch:
+        options = options.replace("%7C", "|")
+    return options
+
+
+def _construct_cgi(cgi, post, options):
+    if not post:
+        # HTTP GET
+        cgi += "?" + options
+    return cgi
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc
new file mode 100644
index 0000000..56c87a0
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc
new file mode 100644
index 0000000..877b4ba
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc
new file mode 100644
index 0000000..acf7c7a
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..7fa3a2a
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..d217589
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..a5a9670
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/Enzyme.py b/code/lib/Bio/ExPASy/Enzyme.py
new file mode 100644
index 0000000..6c1e5ef
--- /dev/null
+++ b/code/lib/Bio/ExPASy/Enzyme.py
@@ -0,0 +1,157 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# Copyright 2009 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Parse the enzyme.dat file from Enzyme at ExPASy.
+
+See https://www.expasy.org/enzyme/
+
+Tested with the release of 03-Mar-2009.
+
+Functions:
+    - read       Reads a file containing one ENZYME entry
+    - parse      Reads a file containing multiple ENZYME entries
+
+Classes:
+    - Record     Holds ENZYME data.
+
+"""
+
+
+def parse(handle):
+    """Parse ENZYME records.
+
+    This function is for parsing ENZYME files containing multiple
+    records.
+
+    Arguments:
+     - handle   - handle to the file.
+
+    """
+    while True:
+        record = __read(handle)
+        if not record:
+            break
+        yield record
+
+
+def read(handle):
+    """Read one ENZYME record.
+
+    This function is for parsing ENZYME files containing
+    exactly one record.
+
+    Arguments:
+     - handle   - handle to the file.
+
+    """
+    record = __read(handle)
+    # We should have reached the end of the record by now
+    remainder = handle.read()
+    if remainder:
+        raise ValueError("More than one ENZYME record found")
+    return record
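A sketch of the intended use of `parse` on a local copy of the ExPASy release (file path hypothetical):

```python
# Sketch only: iterate over a locally downloaded enzyme.dat release.
from lib.Bio.ExPASy import Enzyme  # vendored copy shipped in code/lib

with open("enzyme.dat") as handle:  # hypothetical local copy of the release
    for record in Enzyme.parse(handle):
        print(record["ID"], record["DE"])  # EC number and recommended name
```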
+ + Each record contains the following keys: + + - ID: EC number + - DE: Recommended name + - AN: Alternative names (if any) + - CA: Catalytic activity + - CF: Cofactors (if any) + - PR: Pointers to the Prosite documentation entrie(s) that + correspond to the enzyme (if any) + - DR: Pointers to the Swiss-Prot protein sequence entrie(s) + that correspond to the enzyme (if any) + - CC: Comments + + """ + + def __init__(self): + """Initialize the class.""" + dict.__init__(self) + self["ID"] = "" + self["DE"] = "" + self["AN"] = [] + self["CA"] = "" + self["CF"] = "" + self["CC"] = [] # one comment per line + self["PR"] = [] + self["DR"] = [] + + def __repr__(self): + if self["ID"]: + if self["DE"]: + return "%s (%s, %s)" % (self.__class__.__name__, self["ID"], self["DE"]) + else: + return "%s (%s)" % (self.__class__.__name__, self["ID"]) + else: + return "%s ( )" % (self.__class__.__name__) + + def __str__(self): + output = [ + "ID: " + self["ID"], + "DE: " + self["DE"], + "AN: " + repr(self["AN"]), + "CA: '" + self["CA"] + "'", + "CF: " + self["CF"], + "CC: " + repr(self["CC"]), + "PR: " + repr(self["PR"]), + "DR: %d Records" % len(self["DR"]), + ] + return "\n".join(output) + + +# Everything below is private + + +def __read(handle): + record = None + for line in handle: + key, value = line[:2], line[5:].rstrip() + if key == "ID": + record = Record() + record["ID"] = value + elif key == "DE": + record["DE"] += value + elif key == "AN": + if record["AN"] and not record["AN"][-1].endswith("."): + record["AN"][-1] += " " + value + else: + record["AN"].append(value) + elif key == "CA": + record["CA"] += value + elif key == "DR": + pair_data = value.rstrip(";").split(";") + for pair in pair_data: + t1, t2 = pair.split(",") + row = [t1.strip(), t2.strip()] + record["DR"].append(row) + elif key == "CF": + if record["CF"]: + record["CF"] += " " + value + else: + record["CF"] = value + elif key == "PR": + assert value.startswith("PROSITE; ") + value = value[9:].rstrip(";") + record["PR"].append(value) + elif key == "CC": + if value.startswith("-!- "): + record["CC"].append(value[4:]) + elif value.startswith(" ") and record["CC"]: + record["CC"][-1] += value[3:] + # copyright notice is silently skipped + elif key == "//": + if record: + return record + else: # This was the copyright notice + continue + if record: + raise ValueError("Unexpected end of stream") diff --git a/code/lib/Bio/ExPASy/Prodoc.py b/code/lib/Bio/ExPASy/Prodoc.py new file mode 100644 index 0000000..52981a0 --- /dev/null +++ b/code/lib/Bio/ExPASy/Prodoc.py @@ -0,0 +1,173 @@ +# Copyright 2000 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to work with the prosite.doc file from Prosite. + +See https://www.expasy.org/prosite/ + +Tested with: + - Release 15.0, July 1998 + - Release 16.0, July 1999 + - Release 20.22, 13 November 2007 + - Release 20.43, 10 February 2009 + +Functions: + - read Read a Prodoc file containing exactly one Prodoc entry. + - parse Iterates over entries in a Prodoc file. + +Classes: + - Record Holds Prodoc data. + - Reference Holds data from a Prodoc reference. 
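A minimal usage sketch for the ENZYME parser above, assuming a local copy of enzyme.dat (e.g. downloaded from the ExPASy FTP site):

```python
from Bio.ExPASy import Enzyme

with open("enzyme.dat") as handle:
    for record in Enzyme.parse(handle):
        # Each record behaves like a dict keyed by the two-letter line codes
        print(record["ID"], "-", record["DE"])
```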
+ +""" + + +def read(handle): + """Read in a record from a file with exactly one Prodoc record.""" + record = __read(handle) + # We should have reached the end of the record by now + line = handle.readline() + if line: + raise ValueError("More than one Prodoc record found") + return record + + +def parse(handle): + """Iterate over the records in a Prodoc file.""" + while True: + record = __read(handle) + if not record: + return + yield record + + +class Record: + """Holds information from a Prodoc record. + + Attributes: + - accession Accession number of the record. + - prosite_refs List of tuples (prosite accession, prosite name). + - text Free format text. + - references List of reference objects. + + """ + + def __init__(self): + """Initialize the class.""" + self.accession = "" + self.prosite_refs = [] + self.text = "" + self.references = [] + + +class Reference: + """Holds information from a Prodoc citation. + + Attributes: + - number Number of the reference. (string) + - authors Names of the authors. + - citation Describes the citation. + + """ + + def __init__(self): + """Initialize the class.""" + self.number = "" + self.authors = "" + self.citation = "" + + +# Below are private functions + + +def __read_prosite_reference_line(record, line): + line = line.rstrip() + if line[-1] != "}": + raise ValueError("I don't understand the Prosite reference on line\n%s" % line) + acc, name = line[1:-1].split("; ") + record.prosite_refs.append((acc, name)) + + +def __read_text_line(record, line): + record.text += line + return True + + +def __read_reference_start(record, line): + # Read the references + reference = Reference() + reference.number = line[1:3].strip() + if line[1] == "E": + # If it's an electronic reference, then the URL is on the + # line, instead of the author. 
+ reference.citation = line[4:].strip() + else: + reference.authors = line[4:].strip() + record.references.append(reference) + + +def __read_reference_line(record, line): + if not line.strip(): + return False + reference = record.references[-1] + if line.startswith(" "): + if reference.authors[-1] == ",": + reference.authors += line[4:].rstrip() + else: + reference.citation += line[5:] + return True + raise Exception("I don't understand the reference line\n%s" % line) + + +def __read_copyright_line(record, line): + # Skip the copyright statement + if line.startswith("+----"): + return False + return True + + +def __read(handle): + # Skip blank lines between records + for line in handle: + line = line.rstrip() + if line and not line.startswith("//"): + break + else: + return None + record = Record() + # Read the accession number + if not line.startswith("{PDOC"): + raise ValueError("Line does not start with '{PDOC':\n%s" % line) + if line[-1] != "}": + raise ValueError("I don't understand accession line\n%s" % line) + record.accession = line[1:-1] + # Read the Prosite references + for line in handle: + if line.startswith("{PS"): + __read_prosite_reference_line(record, line) + else: + break + else: + raise ValueError("Unexpected end of stream.") + # Read the actual text + if not line.startswith("{BEGIN"): + raise ValueError("Line does not start with '{BEGIN':\n%s" % line) + read_line = __read_text_line + for line in handle: + if line.startswith("{END}"): + # Clean up the record and return + for reference in record.references: + reference.citation = reference.citation.rstrip() + reference.authors = reference.authors.rstrip() + return record + elif line[0] == "[" and line[3] == "]" and line[4] == " ": + __read_reference_start(record, line) + read_line = __read_reference_line + elif line.startswith("+----"): + read_line = __read_copyright_line + elif read_line: + if not read_line(record, line): + read_line = None + raise ValueError("Unexpected end of stream.") diff --git a/code/lib/Bio/ExPASy/Prosite.py b/code/lib/Bio/ExPASy/Prosite.py new file mode 100644 index 0000000..9174db8 --- /dev/null +++ b/code/lib/Bio/ExPASy/Prosite.py @@ -0,0 +1,314 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# Copyright 2000 by Jeffrey Chang. All rights reserved. +# Revisions Copyright 2007 by Peter Cock. All rights reserved. +# Revisions Copyright 2009 by Michiel de Hoon. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Parser for the prosite dat file from Prosite at ExPASy. + +See https://www.expasy.org/prosite/ + +Tested with: + - Release 20.43, 10-Feb-2009 + - Release 2017_03 of 15-Mar-2017. + +Functions: + - read Reads a Prosite file containing one Prosite record + - parse Iterates over records in a Prosite file. + +Classes: + - Record Holds Prosite data. + +""" + + +def parse(handle): + """Parse Prosite records. + + This function is for parsing Prosite files containing multiple + records. + + Arguments: + - handle - handle to the file. + + """ + while True: + record = __read(handle) + if not record: + break + yield record + + +def read(handle): + """Read one Prosite record. + + This function is for parsing Prosite files containing + exactly one record. + + Arguments: + - handle - handle to the file. 
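Likewise, a usage sketch for the Prodoc parser above, assuming a local prosite.doc file:

```python
from Bio.ExPASy import Prodoc

with open("prosite.doc") as handle:
    for record in Prodoc.parse(handle):
        # record.prosite_refs is a list of (accession, name) tuples
        print(record.accession, len(record.references), "reference(s)")
```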
+ + """ + record = __read(handle) + # We should have reached the end of the record by now + remainder = handle.read() + if remainder: + raise ValueError("More than one Prosite record found") + return record + + +class Record: + """Holds information from a Prosite record. + + Main attributes: + - name ID of the record. e.g. ADH_ZINC + - type Type of entry. e.g. PATTERN, MATRIX, or RULE + - accession e.g. PS00387 + - created Date the entry was created. (MMM-YYYY for releases + before January 2017, DD-MMM-YYYY since January 2017) + - data_update Date the 'primary' data was last updated. + - info_update Date data other than 'primary' data was last updated. + - pdoc ID of the PROSITE DOCumentation. + - description Free-format description. + - pattern The PROSITE pattern. See docs. + - matrix List of strings that describes a matrix entry. + - rules List of rule definitions (from RU lines). (strings) + - prorules List of prorules (from PR lines). (strings) + + NUMERICAL RESULTS: + - nr_sp_release SwissProt release. + - nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int) + - nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs) + - nr_positive True positives. tuple of (hits, seqs) + - nr_unknown Could be positives. tuple of (hits, seqs) + - nr_false_pos False positives. tuple of (hits, seqs) + - nr_false_neg False negatives. (int) + - nr_partial False negatives, because they are fragments. (int) + + COMMENTS: + - cc_taxo_range Taxonomic range. See docs for format + - cc_max_repeat Maximum number of repetitions in a protein + - cc_site Interesting site. list of tuples (pattern pos, desc.) + - cc_skip_flag Can this entry be ignored? + - cc_matrix_type + - cc_scaling_db + - cc_author + - cc_ft_key + - cc_ft_desc + - cc_version version number (introduced in release 19.0) + + The following are all lists if tuples (swiss-prot accession, swiss-prot name). + + DATA BANK REFERENCES: + - dr_positive + - dr_false_neg + - dr_false_pos + - dr_potential Potential hits, but fingerprint region not yet available. + - dr_unknown Could possibly belong + - pdb_structs List of PDB entries. + + """ + + def __init__(self): + """Initialize the class.""" + self.name = "" + self.type = "" + self.accession = "" + self.created = "" + self.data_update = "" + self.info_update = "" + self.pdoc = "" + + self.description = "" + self.pattern = "" + self.matrix = [] + self.rules = [] + self.prorules = [] + self.postprocessing = [] + + self.nr_sp_release = "" + self.nr_sp_seqs = "" + self.nr_total = (None, None) + self.nr_positive = (None, None) + self.nr_unknown = (None, None) + self.nr_false_pos = (None, None) + self.nr_false_neg = None + self.nr_partial = None + + self.cc_taxo_range = "" + self.cc_max_repeat = "" + self.cc_site = [] + self.cc_skip_flag = "" + + self.dr_positive = [] + self.dr_false_neg = [] + self.dr_false_pos = [] + self.dr_potential = [] + self.dr_unknown = [] + + self.pdb_structs = [] + + +# Everything below are private functions + + +def __read(handle): + import re + + record = None + for line in handle: + keyword, value = line[:2], line[5:].rstrip() + if keyword == "ID": + record = Record() + cols = value.split("; ") + if len(cols) != 2: + raise ValueError("I don't understand identification line\n%s" % line) + record.name = cols[0] + record.type = cols[1].rstrip(".") # don't want '.' + elif keyword == "AC": + record.accession = value.rstrip(";") + elif keyword == "DT": + # e.g. from January 2017, + # DT 01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE. 
+ # Older files had brackets round the date descriptions and used MMM-YYYY + dates = value.rstrip(".").split("; ") + if dates[0].endswith((" (CREATED)", " CREATED")): + # Remove last word + record.created = dates[0].rsplit(" ", 1)[0] + else: + raise ValueError("I don't understand date line\n%s" % line) + if dates[1].endswith((" (DATA UPDATE)", " DATA UPDATE")): + # Remove last two words + record.data_update = dates[1].rsplit(" ", 2)[0] + else: + raise ValueError("I don't understand date line\n%s" % line) + if dates[2].endswith((" (INFO UPDATE)", " INFO UPDATE")): + # Remove last two words + record.info_update = dates[2].rsplit(" ", 2)[0] + else: + raise ValueError("I don't understand date line\n%s" % line) + elif keyword == "DE": + record.description = value + elif keyword == "PA": + record.pattern += value + elif keyword == "MA": + record.matrix.append(value) + elif keyword == "PP": + record.postprocessing.extend(value.split(";")) + elif keyword == "RU": + record.rules.append(value) + elif keyword == "NR": + cols = value.split(";") + for col in cols: + if not col: + continue + qual, data = [word.lstrip() for word in col.split("=")] + if qual == "/RELEASE": + release, seqs = data.split(",") + record.nr_sp_release = release + record.nr_sp_seqs = int(seqs) + elif qual == "/FALSE_NEG": + record.nr_false_neg = int(data) + elif qual == "/PARTIAL": + record.nr_partial = int(data) + elif qual in ["/TOTAL", "/POSITIVE", "/UNKNOWN", "/FALSE_POS"]: + m = re.match(r"(\d+)\((\d+)\)", data) + if not m: + raise Exception( + "Broken data %s in comment line\n%r" % (data, line) + ) + hits = tuple(map(int, m.groups())) + if qual == "/TOTAL": + record.nr_total = hits + elif qual == "/POSITIVE": + record.nr_positive = hits + elif qual == "/UNKNOWN": + record.nr_unknown = hits + elif qual == "/FALSE_POS": + record.nr_false_pos = hits + else: + raise ValueError( + "Unknown qual %s in comment line\n%r" % (qual, line) + ) + elif keyword == "CC": + # Expect CC lines like this: + # CC /TAXO-RANGE=??EPV; /MAX-REPEAT=2; + # Can (normally) split on ";" and then on "=" + cols = value.split(";") + for col in cols: + if not col or col[:17] == "Automatic scaling": + # DNAJ_2 in Release 15 has a non-standard comment line: + # CC Automatic scaling using reversed database + # Throw it away. (Should I keep it?) + continue + if col.count("=") == 0: + # Missing qualifier! Can we recover gracefully? 
+ # For example, from Bug 2403, in PS50293 have: + # CC /AUTHOR=K_Hofmann; N_Hulo + continue + qual, data = [word.lstrip() for word in col.split("=")] + if qual == "/TAXO-RANGE": + record.cc_taxo_range = data + elif qual == "/MAX-REPEAT": + record.cc_max_repeat = data + elif qual == "/SITE": + pos, desc = data.split(",") + record.cc_site.append((int(pos), desc)) + elif qual == "/SKIP-FLAG": + record.cc_skip_flag = data + elif qual == "/MATRIX_TYPE": + record.cc_matrix_type = data + elif qual == "/SCALING_DB": + record.cc_scaling_db = data + elif qual == "/AUTHOR": + record.cc_author = data + elif qual == "/FT_KEY": + record.cc_ft_key = data + elif qual == "/FT_DESC": + record.cc_ft_desc = data + elif qual == "/VERSION": + record.cc_version = data + else: + raise ValueError( + "Unknown qual %s in comment line\n%r" % (qual, line) + ) + elif keyword == "DR": + refs = value.split(";") + for ref in refs: + if not ref: + continue + acc, name, type = [word.strip() for word in ref.split(",")] + if type == "T": + record.dr_positive.append((acc, name)) + elif type == "F": + record.dr_false_pos.append((acc, name)) + elif type == "N": + record.dr_false_neg.append((acc, name)) + elif type == "P": + record.dr_potential.append((acc, name)) + elif type == "?": + record.dr_unknown.append((acc, name)) + else: + raise ValueError("I don't understand type flag %s" % type) + elif keyword == "3D": + cols = value.split() + for id in cols: + record.pdb_structs.append(id.rstrip(";")) + elif keyword == "PR": + rules = value.split(";") + record.prorules.extend(rules) + elif keyword == "DO": + record.pdoc = value.rstrip(";") + elif keyword == "//": + if not record: + # Then this was the copyright statement + continue + break + else: + raise ValueError("Unknown keyword %s found" % keyword) + else: + return + if not record: + raise ValueError("Unexpected end of stream.") + return record diff --git a/code/lib/Bio/ExPASy/ScanProsite.py b/code/lib/Bio/ExPASy/ScanProsite.py new file mode 100644 index 0000000..3403703 --- /dev/null +++ b/code/lib/Bio/ExPASy/ScanProsite.py @@ -0,0 +1,145 @@ +# Copyright 2009 by Michiel de Hoon. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code for calling and parsing ScanProsite from ExPASy.""" + +# Importing these functions with leading underscore as not intended for reuse +from urllib.request import urlopen +from urllib.parse import urlencode + +from xml.sax import handler +from xml.sax.expatreader import ExpatParser + + +class Record(list): + """Represents search results returned by ScanProsite. + + This record is a list containing the search results returned by + ScanProsite. The record also contains the data members n_match, + n_seq, capped, and warning. + """ + + def __init__(self): + """Initialize the class.""" + self.n_match = None + self.n_seq = None + self.capped = None + self.warning = None + + +def scan(seq="", mirror="https://www.expasy.org", output="xml", **keywords): + """Execute a ScanProsite search. + + Arguments: + - mirror: The ScanProsite mirror to be used + (default: https://www.expasy.org). 
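A usage sketch for the Prosite record parser above, assuming a local prosite.dat file:

```python
from Bio.ExPASy import Prosite

with open("prosite.dat") as handle:
    for record in Prosite.parse(handle):
        if record.type == "PATTERN":
            # name, accession and pattern are filled from the ID/AC/PA lines
            print(record.accession, record.name, record.pattern)
```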
+ - seq: The query sequence, or UniProtKB (Swiss-Prot, + TrEMBL) accession + - output: Format of the search results + (default: xml) + + Further search parameters can be passed as keywords; see the + documentation for programmatic access to ScanProsite at + https://www.expasy.org/tools/scanprosite/ScanPrositeREST.html + for a description of such parameters. + + This function returns a handle to the search results returned by + ScanProsite. Search results in the XML format can be parsed into a + Python object, by using the Bio.ExPASy.ScanProsite.read function. + + """ + parameters = {"seq": seq, "output": output} + for key, value in keywords.items(): + if value is not None: + parameters[key] = value + command = urlencode(parameters) + url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, command) + handle = urlopen(url) + return handle + + +def read(handle): + """Parse search results returned by ScanProsite into a Python object.""" + content_handler = ContentHandler() + saxparser = Parser() + saxparser.setContentHandler(content_handler) + saxparser.parse(handle) + record = content_handler.record + return record + + +# The classes below are considered private + + +class Parser(ExpatParser): + """Process the result from a ScanProsite search (PRIVATE).""" + + def __init__(self): + """Initialize the class.""" + ExpatParser.__init__(self) + self.firsttime = True + + def feed(self, data, isFinal=0): + """Raise an Error if plain text is received in the data. + + This is to show the Error messages returned by ScanProsite. + """ + # Error messages returned by the ScanProsite server are formatted as + # as plain text instead of an XML document. To catch such error + # messages, we override the feed method of the Expat parser. + # The error message is (hopefully) contained in the data that was just + # fed to the parser. + if self.firsttime: + if data[:5].decode("utf-8") != ">> from Bio import ExPASy + >>> import os + >>> with ExPASy.get_prodoc_entry('PDOC00001') as in_handle: + ... html = in_handle.read() + ... + >>> with open("myprodocrecord.html", "w") as out_handle: + ... length = out_handle.write(html) + ... + >>> os.remove("myprodocrecord.html") # tidy up + + For a non-existing key XXX, ExPASy returns an HTML-formatted page + containing this text: 'There is currently no PROSITE entry for' + """ + return _open("%s?%s" % (cgi, id)) + + +def get_prosite_entry( + id, cgi="https://prosite.expasy.org/cgi-bin/prosite/get-prosite-entry" +): + """Get a text handle to a PROSITE entry at ExPASy in HTML format. + + >>> from Bio import ExPASy + >>> import os + >>> with ExPASy.get_prosite_entry('PS00001') as in_handle: + ... html = in_handle.read() + ... + >>> with open("myprositerecord.html", "w") as out_handle: + ... length = out_handle.write(html) + ... + >>> os.remove("myprositerecord.html") # tidy up + + For a non-existing key XXX, ExPASy returns an HTML-formatted page + containing this text: 'There is currently no PROSITE entry for' + """ + return _open("%s?%s" % (cgi, id)) + + +def get_prosite_raw(id, cgi=None): + """Get a text handle to a raw PROSITE or PRODOC record at ExPASy. + + The cgi argument is deprecated due to changes in the ExPASy + website. + + >>> from Bio import ExPASy + >>> from Bio.ExPASy import Prosite + >>> with ExPASy.get_prosite_raw('PS00001') as handle: + ... record = Prosite.read(handle) + ... 
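The `scan`/`read` pair in the ScanProsite module above can be exercised as follows. This is a live-network sketch against the ExPASy server, and the query sequence is illustrative:

```python
from Bio.ExPASy import ScanProsite

handle = ScanProsite.scan(seq="MEHKEVVLLLLLFLKSGQG")  # illustrative sequence
record = ScanProsite.read(handle)                     # parse the XML results
print(record.n_match, "match(es) in", record.n_seq, "sequence(s)")
for hit in record:
    print(hit)  # each hit is a dict of fields from the ScanProsite XML
```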
+ >>> print(record.accession) + PS00001 + + This function raises a ValueError if the identifier does not exist: + + >>> handle = ExPASy.get_prosite_raw("DOES_NOT_EXIST") + Traceback (most recent call last): + ... + ValueError: Failed to find entry 'DOES_NOT_EXIST' on ExPASy + + """ + handle = _open("https://prosite.expasy.org/%s.txt" % id) + if handle.url == "https://www.expasy.org/": + raise ValueError("Failed to find entry '%s' on ExPASy" % id) from None + return handle + + +def get_sprot_raw(id): + """Get a text handle to a raw SwissProt entry at ExPASy. + + For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt + (as per the https://www.expasy.org/expasy_urls.html documentation). + + >>> from Bio import ExPASy + >>> from Bio import SwissProt + >>> with ExPASy.get_sprot_raw("O23729") as handle: + ... record = SwissProt.read(handle) + ... + >>> print(record.entry_name) + CHS3_BROFI + + This function raises a ValueError if the identifier does not exist: + + >>> ExPASy.get_sprot_raw("DOES_NOT_EXIST") + Traceback (most recent call last): + ... + ValueError: Failed to find SwissProt entry 'DOES_NOT_EXIST' + + """ + try: + handle = _open("http://www.uniprot.org/uniprot/%s.txt" % id) + except HTTPError as exception: + if exception.code == 404: + raise ValueError("Failed to find SwissProt entry '%s'" % id) from None + else: + raise + return handle + + +def _open(url): + """Open URL and convert to text assuming UTF-8 encoding (PRIVATE).""" + handle = urlopen(url) + text_handle = io.TextIOWrapper(handle, encoding="UTF-8") + text_handle.url = handle.url + return text_handle diff --git a/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc new file mode 100644 index 0000000..095f873 Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc new file mode 100644 index 0000000..90a27a5 Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc new file mode 100644 index 0000000..4390c2b Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc new file mode 100644 index 0000000..3b41129 Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..1e75ef7 Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc new file mode 100644 index 0000000..6930c02 Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc differ diff --git a/code/lib/Bio/ExPASy/cellosaurus.py b/code/lib/Bio/ExPASy/cellosaurus.py new file mode 100644 index 0000000..8794cd3 --- /dev/null +++ b/code/lib/Bio/ExPASy/cellosaurus.py @@ -0,0 +1,188 @@ +# Copyright 2016 by Stephen Marshall. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. 
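A sketch tying the raw-record helpers above to their parsers; both calls hit the live ExPASy/UniProt servers, and the accession is the one used in the doctests:

```python
from Bio import ExPASy
from Bio import SwissProt

with ExPASy.get_sprot_raw("O23729") as handle:
    record = SwissProt.read(handle)
print(record.entry_name, record.sequence_length)
```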
Please see the LICENSE file that should have been included +# as part of this package. + +"""Parser for the cellosaurus.txt file from ExPASy. + +See https://web.expasy.org/cellosaurus/ + +Tested with the release of Version 18 (July 2016). + +Functions: + - read Reads a file containing one cell line entry + - parse Reads a file containing multiple cell line entries + +Classes: + - Record Holds cell line data. + +Examples +-------- +You need to download the Cellosaurus database for this examples to +run, e.g. from ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt + + >> from Bio.ExPASy import cellosaurus + >> with open('cellosaurus.txt') as handle: + ... records = cellosaurus.parse(handle) + ... for record in records: + ... if 'Homo sapiens' in record['OX'][0]: + ... print(record['ID']) + ... + #15310-LN + #W7079 + (L)PC6 + 00136 + ... + +""" + + +def parse(handle): + """Parse cell line records. + + This function is for parsing cell line files containing multiple + records. + + Arguments: + - handle - handle to the file. + + """ + while True: + record = __read(handle) + if not record: + break + yield record + + +def read(handle): + """Read one cell line record. + + This function is for parsing cell line files containing + exactly one record. + + Arguments: + - handle - handle to the file. + + """ + record = __read(handle) + # We should have reached the end of the record by now + remainder = handle.read() + if remainder: + raise ValueError("More than one cell line record found") + return record + + +class Record(dict): + """Holds information from an ExPASy Cellosaurus record as a Python dictionary. + + Each record contains the following keys: + + --------- --------------------------- ---------------------- + Line code Content Occurrence in an entry + --------- --------------------------- ---------------------- + ID Identifier (cell line name) Once; starts an entry + AC Accession (CVCL_xxxx) Once + AS Secondary accession number(s) Optional; once + SY Synonyms Optional; once + DR Cross-references Optional; once or more + RX References identifiers Optional: once or more + WW Web pages Optional; once or more + CC Comments Optional; once or more + ST STR profile data Optional; once or more + DI Diseases Optional; once or more + OX Species of origin Once or more + HI Hierarchy Optional; once or more + OI Originate from same individual Optional; once or more + SX Sex (gender) of cell Optional; once + CA Category Once + // Terminator Once; ends an entry + + """ + + def __init__(self): + """Initialize the class.""" + dict.__init__(self) + self["ID"] = "" + self["AC"] = "" + self["AS"] = "" + self["SY"] = "" + self["DR"] = [] + self["RX"] = [] + self["WW"] = [] + self["CC"] = [] + self["ST"] = [] + self["DI"] = [] + self["OX"] = [] + self["HI"] = [] + self["OI"] = [] + self["SX"] = "" + self["CA"] = "" + + def __repr__(self): + if self["ID"]: + if self["AC"]: + return "%s (%s, %s)" % (self.__class__.__name__, self["ID"], self["AC"]) + else: + return "%s (%s)" % (self.__class__.__name__, self["ID"]) + else: + return "%s ( )" % (self.__class__.__name__) + + def __str__(self): + output = "ID: " + self["ID"] + output += " AC: " + self["AC"] + output += " AS: " + self["AS"] + output += " SY: " + self["SY"] + output += " DR: " + repr(self["DR"]) + output += " RX: " + repr(self["RX"]) + output += " WW: " + repr(self["WW"]) + output += " CC: " + repr(self["CC"]) + output += " ST: " + repr(self["ST"]) + output += " DI: " + repr(self["DI"]) + output += " OX: " + repr(self["OX"]) + output += " HI: " + 
repr(self["HI"]) + output += " OI: " + repr(self["OI"]) + output += " SX: " + self["SX"] + output += " CA: " + self["CA"] + return output + + +# Everything below is private + + +def __read(handle): + record = None + + for line in handle: + + key, value = line[:2], line[5:].rstrip() + if key == "ID": + record = Record() + record["ID"] = value + elif key in ["AC", "AS", "SY", "SX", "CA"]: + record[key] += value + elif key in [ + "AC", + "AS", + "SY", + "RX", + "WW", + "CC", + "ST", + "DI", + "OX", + "HI", + "OI", + "SX", + "CA", + ]: + record[key].append(value) + elif key == "DR": + k, v = value.split(";") + record["DR"].append((k.strip(), v.strip())) + elif key == "//": + if record: + return record + else: + continue + if record: + raise ValueError("Unexpected end of stream") diff --git a/code/lib/Bio/File.py b/code/lib/Bio/File.py new file mode 100644 index 0000000..5edec51 --- /dev/null +++ b/code/lib/Bio/File.py @@ -0,0 +1,609 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# Copyright 2009-2018 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Code for more fancy file handles. + +Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for +indexing files. These are not intended for direct use. +""" + +import os +import contextlib +import itertools +import collections.abc + +from abc import ABC, abstractmethod + +try: + import sqlite3 +except ImportError: + # May be missing if Python was compiled from source without its dependencies + sqlite3 = None + + +@contextlib.contextmanager +def as_handle(handleish, mode="r", **kwargs): + r"""Context manager to ensure we are using a handle. + + Context manager for arguments that can be passed to SeqIO and AlignIO read, write, + and parse methods: either file objects or path-like objects (strings, pathlib.Path + instances, or more generally, anything that can be handled by the builtin 'open' + function). + + When given a path-like object, returns an open file handle to that path, with provided + mode, which will be closed when the manager exits. + + All other inputs are returned, and are *not* closed. + + Arguments: + - handleish - Either a file handle or path-like object (anything which can be + passed to the builtin 'open' function, such as str, bytes, + pathlib.Path, and os.DirEntry objects) + - mode - Mode to open handleish (used only if handleish is a string) + - kwargs - Further arguments to pass to open(...) + + Examples + -------- + >>> from Bio import File + >>> import os + >>> with File.as_handle('seqs.fasta', 'w') as fp: + ... fp.write('>test\nACGT') + ... + 10 + >>> fp.closed + True + + >>> handle = open('seqs.fasta', 'w') + >>> with File.as_handle(handle) as fp: + ... fp.write('>test\nACGT') + ... + 10 + >>> fp.closed + False + >>> fp.close() + >>> os.remove("seqs.fasta") # tidy up + + """ + try: + with open(handleish, mode, **kwargs) as fp: + yield fp + except TypeError: + yield handleish + + +def _open_for_random_access(filename): + """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE). + + This functionality is used by the Bio.SeqIO and Bio.SearchIO index + and index_db functions. + + If the file is gzipped but not BGZF, a specific ValueError is raised. 
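A usage sketch for the Cellosaurus parser above, assuming a local cellosaurus.txt download as described in the module docstring:

```python
from Bio.ExPASy import cellosaurus

with open("cellosaurus.txt") as handle:
    for record in cellosaurus.parse(handle):
        if record["SX"] == "Female":          # SX holds the sex line, if any
            print(record["ID"], record["AC"])
```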
+    """
+    handle = open(filename, "rb")
+    magic = handle.read(2)
+    handle.seek(0)
+
+    if magic == b"\x1f\x8b":
+        # This is a gzipped file, but is it BGZF?
+        from . import bgzf
+
+        try:
+            # If it is BGZF, we support that
+            return bgzf.BgzfReader(mode="rb", fileobj=handle)
+        except ValueError as e:
+            assert "BGZF" in str(e)
+            # Not a BGZF file after all,
+            handle.close()
+            raise ValueError(
+                "Gzipped files are not suitable for indexing, "
+                "please use BGZF (blocked gzip format) instead."
+            ) from None
+
+    return handle
+
+
+# The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
+# for indexing
+
+
+class _IndexedSeqFileProxy(ABC):
+    """Abstract base class for file format specific random access (PRIVATE).
+
+    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
+    objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+    Subclasses for each file format should define '__iter__', 'get'
+    and optionally 'get_raw' methods.
+    """
+
+    @abstractmethod
+    def __iter__(self):
+        """Return (identifier, offset, length in bytes) tuples.
+
+        The length can be zero where it is not implemented or not
+        possible for a particular file format.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get(self, offset):
+        """Return parsed object for this entry."""
+        # Most file formats with self contained records can be handled by
+        # parsing StringIO(self.get_raw(offset).decode())
+        raise NotImplementedError
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string (if implemented).
+
+        If the key is not found, a KeyError exception is raised.
+
+        This may not have been implemented for all file formats.
+        """
+        # Should be done by each sub-class (if possible)
+        raise NotImplementedError("Not available for this file format.")
+
+
+class _IndexedSeqFileDict(collections.abc.Mapping):
+    """Read only dictionary interface to a sequential record file.
+
+    This code is used in both Bio.SeqIO for indexing as SeqRecord
+    objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+    Keeps the keys and associated file offsets in memory, reads the file
+    to access entries as objects, parsing them on demand. This approach
+    is memory limited, but will work even with millions of records.
+
+    Note duplicate keys are not allowed. If this happens, a ValueError
+    exception is raised.
+
+    As used in Bio.SeqIO, by default the SeqRecord's id string is used
+    as the dictionary key. In Bio.SearchIO, the query's id string is
+    used. This can be changed by supplying an optional key_function,
+    a callback function which will be given the record id and must
+    return the desired key. For example, this allows you to parse
+    NCBI style FASTA identifiers, and extract the GI number to use
+    as the dictionary key.
+
+    Note that this dictionary is essentially read only. You cannot
+    add or change values, pop values, nor clear the dictionary.
+    """
+
+    def __init__(self, random_access_proxy, key_function, repr, obj_repr):
+        """Initialize the class."""
+        # Use key_function=None for default value
+        self._proxy = random_access_proxy
+        self._key_function = key_function
+        self._repr = repr
+        self._obj_repr = obj_repr
+        if key_function:
+            offset_iter = ((key_function(k), o, l) for (k, o, l) in random_access_proxy)
+        else:
+            offset_iter = random_access_proxy
+        offsets = {}
+        for key, offset, length in offset_iter:
+            # Note - we don't store the length because I want to minimise the
+            # memory requirements.
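The two calling conventions `as_handle` (shown above) supports can be contrasted in a short sketch; the filename is illustrative:

```python
from Bio import File

with File.as_handle("seqs.fasta", "w") as fp:   # path-like: closed on exit
    fp.write(">test\nACGT\n")

existing = open("seqs.fasta")
with File.as_handle(existing) as fp:            # already a handle: passed through
    print(fp.read().splitlines()[0])
print(existing.closed)                          # False - the caller still owns it
existing.close()
```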
With the SQLite backend the length is kept + # and is used to speed up the get_raw method (by about 3 times). + # The length should be provided by all the current backends except + # SFF where there is an existing Roche index we can reuse (very fast + # but lacks the record lengths) + # assert length or format in ["sff", "sff-trim"], \ + # "%s at offset %i given length %r (%s format %s)" \ + # % (key, offset, length, filename, format) + if key in offsets: + self._proxy._handle.close() + raise ValueError("Duplicate key '%s'" % key) + else: + offsets[key] = offset + self._offsets = offsets + + def __repr__(self): + """Return a string representation of the File object.""" + return self._repr + + def __str__(self): + """Create a string representation of the File object.""" + # TODO - How best to handle the __str__ for SeqIO and SearchIO? + if self: + return "{%r : %s(...), ...}" % (list(self.keys())[0], self._obj_repr) + else: + return "{}" + + def __len__(self): + """Return the number of records.""" + return len(self._offsets) + + def __iter__(self): + """Iterate over the keys.""" + return iter(self._offsets) + + def __getitem__(self, key): + """Return record for the specified key.""" + # Pass the offset to the proxy + record = self._proxy.get(self._offsets[key]) + if self._key_function: + key2 = self._key_function(record.id) + else: + key2 = record.id + if key != key2: + raise ValueError("Key did not match (%s vs %s)" % (key, key2)) + return record + + def get_raw(self, key): + """Return the raw record from the file as a bytes string. + + If the key is not found, a KeyError exception is raised. + """ + # Pass the offset to the proxy + return self._proxy.get_raw(self._offsets[key]) + + def close(self): + """Close the file handle being used to read the data. + + Once called, further use of the index won't work. The sole purpose + of this method is to allow explicit handle closure - for example + if you wish to delete the file, on Windows you must first close + all open handles to that file. + """ + self._proxy._handle.close() + + +class _SQLiteManySeqFilesDict(_IndexedSeqFileDict): + """Read only dictionary interface to many sequential record files. + + This code is used in both Bio.SeqIO for indexing as SeqRecord + objects, and in Bio.SearchIO for indexing QueryResult objects. + + Keeps the keys, file-numbers and offsets in an SQLite database. To access + a record by key, reads from the offset in the appropriate file and then + parses the record into an object. + + There are OS limits on the number of files that can be open at once, + so a pool are kept. If a record is required from a closed file, then + one of the open handles is closed first. + """ + + def __init__( + self, + index_filename, + filenames, + proxy_factory, + fmt, + key_function, + repr, + max_open=10, + ): + """Initialize the class.""" + # TODO? - Don't keep filename list in memory (just in DB)? + # Should save a chunk of memory if dealing with 1000s of files. 
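`_IndexedSeqFileDict` above is the machinery behind the public `Bio.SeqIO.index()` function; a sketch of that public entry point, with an illustrative filename:

```python
from Bio import SeqIO

index = SeqIO.index("seqs.fasta", "fasta")  # filename illustrative
print(len(index))                           # number of records indexed
first_key = next(iter(index))               # keys are held in memory...
record = index[first_key]                   # ...records are parsed on demand
raw = index.get_raw(first_key)              # raw bytes of the original record
index.close()
```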
+ # Furthermore could compare a generator to the DB on reloading + # (no need to turn it into a list) + + if sqlite3 is None: + # Python was compiled without sqlite3 support + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Python was compiled without the sqlite3 module" + ) + if filenames is not None: + filenames = list(filenames) # In case it was a generator + + # Cache the arguments as private variables + self._index_filename = index_filename + self._filenames = filenames + self._format = fmt + self._key_function = key_function + self._proxy_factory = proxy_factory + self._repr = repr + self._max_open = max_open + self._proxies = {} + + # Note if using SQLite :memory: trick index filename, this will + # give $PWD as the relative path (which is fine). + self._relative_path = os.path.abspath(os.path.dirname(index_filename)) + + if os.path.isfile(index_filename): + self._load_index() + else: + self._build_index() + + def _load_index(self): + """Call from __init__ to re-use an existing index (PRIVATE).""" + index_filename = self._index_filename + relative_path = self._relative_path + filenames = self._filenames + fmt = self._format + proxy_factory = self._proxy_factory + + con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False) + self._con = con + # Check the count... + try: + (count,) = con.execute( + "SELECT value FROM meta_data WHERE key=?;", ("count",) + ).fetchone() + self._length = int(count) + if self._length == -1: + con.close() + raise ValueError("Unfinished/partial database") from None + + # use MAX(_ROWID_) to obtain the number of sequences in the database + # using COUNT(key) is quite slow in SQLITE + # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables) + (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone() + if self._length != int(count): + con.close() + raise ValueError( + "Corrupt database? 
%i entries not %i" % (int(count), self._length) + ) from None + (self._format,) = con.execute( + "SELECT value FROM meta_data WHERE key=?;", ("format",) + ).fetchone() + if fmt and fmt != self._format: + con.close() + raise ValueError( + "Index file says format %s, not %s" % (self._format, fmt) + ) from None + try: + (filenames_relative_to_index,) = con.execute( + "SELECT value FROM meta_data WHERE key=?;", + ("filenames_relative_to_index",), + ).fetchone() + filenames_relative_to_index = ( + filenames_relative_to_index.upper() == "TRUE" + ) + except TypeError: + # Original behaviour, assume if meta_data missing + filenames_relative_to_index = False + self._filenames = [ + row[0] + for row in con.execute( + "SELECT name FROM file_data ORDER BY file_number;" + ).fetchall() + ] + if filenames_relative_to_index: + # Not implicitly relative to $PWD, explicitly relative to index file + relative_path = os.path.abspath(os.path.dirname(index_filename)) + tmp = [] + for f in self._filenames: + if os.path.isabs(f): + tmp.append(f) + else: + # Would be stored with Unix / path separator, so convert + # it to the local OS path separator here: + tmp.append( + os.path.join(relative_path, f.replace("/", os.path.sep)) + ) + self._filenames = tmp + del tmp + if filenames and len(filenames) != len(self._filenames): + con.close() + raise ValueError( + "Index file says %i files, not %i" + % (len(self._filenames), len(filenames)) + ) from None + if filenames and filenames != self._filenames: + for old, new in zip(self._filenames, filenames): + # Want exact match (after making relative to the index above) + if os.path.abspath(old) != os.path.abspath(new): + con.close() + if filenames_relative_to_index: + raise ValueError( + "Index file has different filenames, e.g. %r != %r" + % (os.path.abspath(old), os.path.abspath(new)) + ) from None + else: + raise ValueError( + "Index file has different filenames " + "[This is an old index where any relative paths " + "were relative to the original working directory]. " + "e.g. %r != %r" + % (os.path.abspath(old), os.path.abspath(new)) + ) from None + # Filenames are equal (after imposing abspath) + except sqlite3.OperationalError as err: + con.close() + raise ValueError("Not a Biopython index database? 
%s" % err) from None + # Now we have the format (from the DB if not given to us), + if not proxy_factory(self._format): + con.close() + raise ValueError("Unsupported format '%s'" % self._format) + + def _build_index(self): + """Call from __init__ to create a new index (PRIVATE).""" + index_filename = self._index_filename + relative_path = self._relative_path + filenames = self._filenames + fmt = self._format + key_function = self._key_function + proxy_factory = self._proxy_factory + max_open = self._max_open + random_access_proxies = self._proxies + + if not fmt or not filenames: + raise ValueError( + "Filenames to index and format required to build %r" % index_filename + ) + if not proxy_factory(fmt): + raise ValueError("Unsupported format '%s'" % fmt) + # Create the index + con = sqlite3.dbapi2.connect(index_filename) + self._con = con + # print("Creating index") + # Sqlite PRAGMA settings for speed + con.execute("PRAGMA synchronous=OFF") + con.execute("PRAGMA locking_mode=EXCLUSIVE") + # Don't index the key column until the end (faster) + # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, " + # "offset INTEGER);") + con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") + con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1)) + con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt)) + con.execute( + "INSERT INTO meta_data (key, value) VALUES (?,?);", + ("filenames_relative_to_index", "True"), + ) + # TODO - Record the file size and modified date? + con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);") + con.execute( + "CREATE TABLE offset_data (key TEXT, " + "file_number INTEGER, offset INTEGER, length INTEGER);" + ) + count = 0 + for i, filename in enumerate(filenames): + # Default to storing as an absolute path, + f = os.path.abspath(filename) + if not os.path.isabs(filename) and not os.path.isabs(index_filename): + # Since user gave BOTH filename & index as relative paths, + # we will store this relative to the index file even though + # if it may now start ../ (meaning up a level) + # Note for cross platform use (e.g. shared drive over SAMBA), + # convert any Windows slash into Unix style for rel paths. + f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/") + elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith( + relative_path + os.path.sep + ): + # Since sequence file is in same directory or sub directory, + # might as well make this into a relative path: + f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/") + assert not f.startswith("../"), f + # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f)) + con.execute( + "INSERT INTO file_data (file_number, name) VALUES (?,?);", (i, f) + ) + random_access_proxy = proxy_factory(fmt, filename) + if key_function: + offset_iter = ( + (key_function(k), i, o, l) for (k, o, l) in random_access_proxy + ) + else: + offset_iter = ((k, i, o, l) for (k, o, l) in random_access_proxy) + while True: + batch = list(itertools.islice(offset_iter, 100)) + if not batch: + break + # print("Inserting batch of %i offsets, %s ... 
%s" + # % (len(batch), batch[0][0], batch[-1][0])) + con.executemany( + "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);", + batch, + ) + con.commit() + count += len(batch) + if len(random_access_proxies) < max_open: + random_access_proxies[i] = random_access_proxy + else: + random_access_proxy._handle.close() + self._length = count + # print("About to index %i entries" % count) + try: + con.execute( + "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);" + ) + except sqlite3.IntegrityError as err: + self._proxies = random_access_proxies + self.close() + con.close() + raise ValueError("Duplicate key? %s" % err) from None + con.execute("PRAGMA locking_mode=NORMAL") + con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count")) + con.commit() + # print("Index created") + + def __repr__(self): + return self._repr + + def __contains__(self, key): + return bool( + self._con.execute( + "SELECT key FROM offset_data WHERE key=?;", (key,) + ).fetchone() + ) + + def __len__(self): + """Return the number of records indexed.""" + return self._length + # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0] + + def __iter__(self): + """Iterate over the keys.""" + for row in self._con.execute( + "SELECT key FROM offset_data ORDER BY file_number, offset;" + ): + yield str(row[0]) + + def __getitem__(self, key): + """Return record for the specified key.""" + # Pass the offset to the proxy + row = self._con.execute( + "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,) + ).fetchone() + if not row: + raise KeyError + file_number, offset = row + proxies = self._proxies + if file_number in proxies: + record = proxies[file_number].get(offset) + else: + if len(proxies) >= self._max_open: + # Close an old handle... + proxies.popitem()[1]._handle.close() + # Open a new handle... + proxy = self._proxy_factory(self._format, self._filenames[file_number]) + record = proxy.get(offset) + proxies[file_number] = proxy + if self._key_function: + key2 = self._key_function(record.id) + else: + key2 = record.id + if key != key2: + raise ValueError("Key did not match (%s vs %s)" % (key, key2)) + return record + + def get_raw(self, key): + """Return the raw record from the file as a bytes string. + + If the key is not found, a KeyError exception is raised. + """ + # Pass the offset to the proxy + row = self._con.execute( + "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,) + ).fetchone() + if not row: + raise KeyError + file_number, offset, length = row + proxies = self._proxies + if file_number in proxies: + if length: + # Shortcut if we have the length + h = proxies[file_number]._handle + h.seek(offset) + return h.read(length) + else: + return proxies[file_number].get_raw(offset) + else: + # This code is duplicated from __getitem__ to avoid a function call + if len(proxies) >= self._max_open: + # Close an old handle... + proxies.popitem()[1]._handle.close() + # Open a new handle... 
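The SQLite-backed class being built here powers `Bio.SeqIO.index_db()`; a sketch of that public API, with illustrative filenames:

```python
from Bio import SeqIO

# Build (or reopen) a persistent SQLite index over several sequence files
db = SeqIO.index_db("seqs.idx", ["part1.fasta", "part2.fasta"], "fasta")
print(len(db))        # count read back from the meta_data table
key = next(iter(db))  # keys stream from the offset_data table
record = db[key]      # file_number + offset resolved via SQL, then parsed
db.close()            # closes the pooled file handles
```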
+ proxy = self._proxy_factory(self._format, self._filenames[file_number]) + proxies[file_number] = proxy + if length: + # Shortcut if we have the length + h = proxy._handle + h.seek(offset) + return h.read(length) + else: + return proxy.get_raw(offset) + + def close(self): + """Close any open file handles.""" + proxies = self._proxies + while proxies: + proxies.popitem()[1]._handle.close() diff --git a/code/lib/Bio/GenBank/Record.py b/code/lib/Bio/GenBank/Record.py new file mode 100644 index 0000000..268efa8 --- /dev/null +++ b/code/lib/Bio/GenBank/Record.py @@ -0,0 +1,669 @@ +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# + +"""Hold GenBank data in a straightforward format. + +Classes: + - Record - All of the information in a GenBank record. + - Reference - hold reference data for a record. + - Feature - Hold the information in a Feature Table. + - Qualifier - Qualifiers on a Feature. + +""" + +import Bio.GenBank + + +def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "): + """Write a line of GenBank info that can wrap over multiple lines (PRIVATE). + + This takes a line of information which can potentially wrap over + multiple lines, and breaks it up with carriage returns and + indentation so it fits properly into a GenBank record. + + Arguments: + - information - The string holding the information we want + wrapped in GenBank method. + - indent - The indentation on the lines we are writing. + - wrap_space - Whether or not to wrap only on spaces in the + information. + - split_char - A specific character to split the lines on. By default + spaces are used. + + """ + info_length = Record.GB_LINE_LENGTH - indent + + if not information: + # GenBank files use "." for missing data + return ".\n" + + if wrap_space: + info_parts = information.split(split_char) + else: + cur_pos = 0 + info_parts = [] + while cur_pos < len(information): + info_parts.append(information[cur_pos : cur_pos + info_length]) + cur_pos += info_length + + # first get the information string split up by line + output_parts = [] + cur_part = "" + for info_part in info_parts: + if len(cur_part) + 1 + len(info_part) > info_length: + if cur_part: + if split_char != " ": + cur_part += split_char + output_parts.append(cur_part) + cur_part = info_part + else: + if cur_part == "": + cur_part = info_part + else: + cur_part += split_char + info_part + + # add the last bit of information to the output + if cur_part: + output_parts.append(cur_part) + + # now format the information string for return + output_info = output_parts[0] + "\n" + for output_part in output_parts[1:]: + output_info += " " * indent + output_part + "\n" + + return output_info + + +def _indent_genbank(information, indent): + """Write out information with the specified indent (PRIVATE). + + Unlike _wrapped_genbank, this function makes no attempt to wrap + lines -- it assumes that the information already has newlines in the + appropriate places, and will add the specified indent to the start of + each line. + """ + # split the info into lines based on line breaks + info_parts = information.split("\n") + + # the first line will have no indent + output_info = info_parts[0] + "\n" + for info_part in info_parts[1:]: + output_info += " " * indent + info_part + "\n" + + return output_info + + +class Record: + """Hold GenBank information in a format similar to the original record. 
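The wrapping helper defined above can be exercised directly; this imports a private name purely for illustration:

```python
from Bio.GenBank.Record import _wrapped_genbank

info = "Homo sapiens chromosome 1 partial sequence with a fairly long definition line"
print(_wrapped_genbank(info, 12), end="")
# Continuation lines are indented 12 spaces so the text lines up under the
# GenBank keyword column; missing data would be written as "." instead.
```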
+
+    The Record class is meant to make data easy to get to when you are
+    just interested in looking at GenBank data.
+
+    Attributes:
+     - locus - The name specified after the LOCUS keyword in the GenBank
+       record. This may be the accession number, or a clone id or something else.
+     - size - The size of the record.
+     - residue_type - The type of residues making up the sequence in this
+       record. Normally something like RNA, DNA or PROTEIN, but may be as
+       esoteric as 'ss-RNA circular'.
+     - data_file_division - The division this record is stored under in
+       GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
+     - date - The date of submission of the record, in a form like '28-JUL-1998'
+     - accession - list of all accession numbers for the sequence.
+     - nid - Nucleotide identifier number.
+     - pid - Protein identifier number
+     - version - The accession number + version (ie. AB01234.2)
+     - db_source - Information about the database the record came from
+     - gi - The NCBI gi identifier for the record.
+     - keywords - A list of keywords related to the record.
+     - segment - If the record is one of a series, this is info about which
+       segment this record is (something like '1 of 6').
+     - source - The source of material where the sequence came from.
+     - organism - The genus and species of the organism (ie. 'Homo sapiens')
+     - taxonomy - A listing of the taxonomic classification of the organism,
+       starting general and getting more specific.
+     - references - A list of Reference objects.
+     - comment - Text with any kind of comment about the record.
+     - features - A listing of Features making up the feature table.
+     - base_counts - A string with the counts of bases for the sequence.
+     - origin - A string specifying info about the origin of the sequence.
+     - sequence - A string with the sequence itself.
+     - contig - A string of location information for a CONTIG in a RefSeq file
+     - project - The genome sequencing project numbers
+       (will be replaced by the dblink cross-references in 2009).
+     - dblinks - The genome sequencing project number(s) and other links.
+       (will replace the project information in 2009).
+ + """ + + # constants for outputting GenBank information + GB_LINE_LENGTH = 79 + GB_BASE_INDENT = 12 + GB_FEATURE_INDENT = 21 + GB_INTERNAL_INDENT = 2 + GB_OTHER_INTERNAL_INDENT = 3 + GB_FEATURE_INTERNAL_INDENT = 5 + GB_SEQUENCE_INDENT = 9 + + BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s" + INTERNAL_FORMAT = ( + " " * GB_INTERNAL_INDENT + "%-" + str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s" + ) + OTHER_INTERNAL_FORMAT = ( + " " * GB_OTHER_INTERNAL_INDENT + + "%-" + + str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + + "s" + ) + + BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s" + INTERNAL_FEATURE_FORMAT = ( + " " * GB_FEATURE_INTERNAL_INDENT + + "%-" + + str(GB_FEATURE_INDENT - GB_FEATURE_INTERNAL_INDENT) + + "s" + ) + SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s" + + def __init__(self): + """Initialize the class.""" + self.accession = [] + self.base_counts = "" + self.comment = "" + self.contig = "" + self.data_file_division = "" + self.date = "" + self.db_source = "" + self.dblinks = [] + self.definition = "" + self.features = [] + self.gi = "" + self.keywords = [] + self.locus = "" + self.molecule_type = "" + self.nid = "" + self.organism = "" + self.origin = "" + self.pid = "" + self.primary = [] + self.projects = [] + self.references = [] + self.residue_type = "" + self.segment = "" + self.sequence = "" + self.size = "" + self.source = "" + self.taxonomy = [] + self.topology = "" + self.version = "" + self.wgs = "" + self.wgs_scafld = [] + + def __str__(self): + """Provide a GenBank formatted output option for a Record. + + The objective of this is to provide an easy way to read in a GenBank + record, modify it somehow, and then output it in 'GenBank format.' + We are striving to make this work so that a parsed Record that is + output using this function will look exactly like the original + record. 
+ + Much of the output is based on format description info at: + + ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt + """ + output = self._locus_line() + output += self._definition_line() + output += self._accession_line() + output += self._version_line() + output += self._project_line() + output += self._dblink_line() + output += self._nid_line() + output += self._pid_line() + output += self._keywords_line() + output += self._db_source_line() + output += self._segment_line() + output += self._source_line() + output += self._organism_line() + for reference in self.references: + output += str(reference) + output += self._comment_line() + output += self._features_line() + for feature in self.features: + output += str(feature) + output += self._base_count_line() + output += self._origin_line() + output += self._sequence_line() + output += self._wgs_line() + output += self._wgs_scafld_line() + output += self._contig_line() + output += "//" + return output + + def _locus_line(self): + """Provide the output string for the LOCUS line (PRIVATE).""" + output = "LOCUS" + output += " " * 7 # 6-12 spaces + output += "%-9s" % self.locus + output += " " # 22 space + output += "%7s" % self.size + if "PROTEIN" in self.residue_type: + output += " aa" + else: + output += " bp " + + # treat circular types differently, since they'll have long residue + # types + if "circular" in self.residue_type: + output += "%17s" % self.residue_type + # second case: ss-DNA types of records + elif "-" in self.residue_type: + output += "%7s" % self.residue_type + output += " " * 10 # spaces for circular + else: + output += " " * 3 # spaces for stuff like ss- + output += "%-4s" % self.residue_type + output += " " * 10 # spaces for circular + + output += " " * 2 + output += "%3s" % self.data_file_division + output += " " * 7 # spaces for 56-63 + output += "%11s" % self.date + output += "\n" + return output + + def _definition_line(self): + """Provide output for the DEFINITION line (PRIVATE).""" + output = Record.BASE_FORMAT % "DEFINITION" + output += _wrapped_genbank(self.definition + ".", Record.GB_BASE_INDENT) + return output + + def _accession_line(self): + """Output for the ACCESSION line (PRIVATE).""" + if self.accession: + output = Record.BASE_FORMAT % "ACCESSION" + + acc_info = "" + for accession in self.accession: + acc_info += "%s " % accession + # strip off an extra space at the end + acc_info = acc_info.rstrip() + output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT) + else: + output = "" + + return output + + def _version_line(self): + """Output for the VERSION line (PRIVATE).""" + if self.version: + output = Record.BASE_FORMAT % "VERSION" + output += self.version + output += " GI:" + output += "%s\n" % self.gi + else: + output = "" + return output + + def _project_line(self): + output = "" + if len(self.projects) > 0: + output = Record.BASE_FORMAT % "PROJECT" + output += "%s\n" % " ".join(self.projects) + return output + + def _dblink_line(self): + output = "" + if len(self.dblinks) > 0: + output = Record.BASE_FORMAT % "DBLINK" + dblink_info = "\n".join(self.dblinks) + output += _wrapped_genbank(dblink_info, Record.GB_BASE_INDENT) + return output + + def _nid_line(self): + """Output for the NID line. Use of NID is obsolete in GenBank files (PRIVATE).""" + if self.nid: + output = Record.BASE_FORMAT % "NID" + output += "%s\n" % self.nid + else: + output = "" + return output + + def _pid_line(self): + """Output for PID line. 
Presumedly, PID usage is also obsolete (PRIVATE).""" + if self.pid: + output = Record.BASE_FORMAT % "PID" + output += "%s\n" % self.pid + else: + output = "" + return output + + def _keywords_line(self): + """Output for the KEYWORDS line (PRIVATE).""" + output = "" + if self.keywords: + output += Record.BASE_FORMAT % "KEYWORDS" + keyword_info = "" + for keyword in self.keywords: + keyword_info += "%s; " % keyword + # replace the ; at the end with a period + keyword_info = keyword_info[:-2] + keyword_info += "." + + output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT) + + return output + + def _db_source_line(self): + """Output for DBSOURCE line (PRIVATE).""" + if self.db_source: + output = Record.BASE_FORMAT % "DBSOURCE" + output += "%s\n" % self.db_source + else: + output = "" + return output + + def _segment_line(self): + """Output for the SEGMENT line (PRIVATE).""" + output = "" + if self.segment: + output += Record.BASE_FORMAT % "SEGMENT" + output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT) + return output + + def _source_line(self): + """Output for SOURCE line on where the sample came from (PRIVATE).""" + output = Record.BASE_FORMAT % "SOURCE" + output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT) + return output + + def _organism_line(self): + """Output for ORGANISM line with taxonomy info (PRIVATE).""" + output = Record.INTERNAL_FORMAT % "ORGANISM" + # Now that species names can be too long, this line can wrap (Bug 2591) + output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT) + output += " " * Record.GB_BASE_INDENT + taxonomy_info = "" + for tax in self.taxonomy: + taxonomy_info += "%s; " % tax + # replace the ; at the end with a period + taxonomy_info = taxonomy_info[:-2] + taxonomy_info += "." + output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT) + + return output + + def _comment_line(self): + """Output for the COMMENT lines (PRIVATE).""" + output = "" + if self.comment: + output += Record.BASE_FORMAT % "COMMENT" + output += _indent_genbank(self.comment, Record.GB_BASE_INDENT) + return output + + def _features_line(self): + """Output for the FEATURES line (PRIVATE).""" + output = "" + if len(self.features) > 0: + output += Record.BASE_FEATURE_FORMAT % "FEATURES" + output += "Location/Qualifiers\n" + return output + + def _base_count_line(self): + """Output for the BASE COUNT line with base information (PRIVATE).""" + output = "" + if self.base_counts: + output += Record.BASE_FORMAT % "BASE COUNT " + # split up the base counts into their individual parts + count_parts = self.base_counts.split(" ") + while "" in count_parts: + count_parts.remove("") + # deal with the standard case, with a normal origin line + # like: 474 a 356 c 428 g 364 t + if len(count_parts) % 2 == 0: + while len(count_parts) > 0: + count_info = count_parts.pop(0) + count_type = count_parts.pop(0) + + output += "%7s %s" % (count_info, count_type) + # deal with ugly ORIGIN lines like: + # 1311257 a2224835 c2190093 g1309889 t + # by just outputting the raw information + else: + output += self.base_counts + output += "\n" + return output + + def _origin_line(self): + """Output for the ORIGIN line (PRIVATE).""" + output = "" + # only output the ORIGIN line if we have a sequence + if self.sequence: + output += Record.BASE_FORMAT % "ORIGIN" + if self.origin: + output += _wrapped_genbank(self.origin, Record.GB_BASE_INDENT) + else: + output += "\n" + return output + + def _sequence_line(self): + """Output for all of the sequence (PRIVATE).""" + output = "" + if 
self.sequence: + cur_seq_pos = 0 + while cur_seq_pos < len(self.sequence): + output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1) + + for section in range(6): + start_pos = cur_seq_pos + section * 10 + end_pos = start_pos + 10 + seq_section = self.sequence[start_pos:end_pos] + output += " %s" % seq_section.lower() + + # stop looping if we are out of sequence + if end_pos > len(self.sequence): + break + + output += "\n" + cur_seq_pos += 60 + return output + + def _wgs_line(self): + output = "" + if self.wgs: + output += Record.BASE_FORMAT % "WGS" + output += self.wgs + return output + + def _wgs_scafld_line(self): + output = "" + if self.wgs_scafld: + output += Record.BASE_FORMAT % "WGS_SCAFLD" + output += self.wgs_scafld + return output + + def _contig_line(self): + """Output for CONTIG location information from RefSeq (PRIVATE).""" + output = "" + if self.contig: + output += Record.BASE_FORMAT % "CONTIG" + output += _wrapped_genbank( + self.contig, Record.GB_BASE_INDENT, split_char="," + ) + return output + + +class Reference: + """Hold information from a GenBank reference. + + Attributes: + - number - The number of the reference in the listing of references. + - bases - The bases in the sequence the reference refers to. + - authors - String with all of the authors. + - consrtm - Consortium the authors belong to. + - title - The title of the reference. + - journal - Information about the journal where the reference appeared. + - medline_id - The medline id for the reference. + - pubmed_id - The pubmed_id for the reference. + - remark - Free-form remarks about the reference. + + """ + + def __init__(self): + """Initialize the class.""" + self.number = "" + self.bases = "" + self.authors = "" + self.consrtm = "" + self.title = "" + self.journal = "" + self.medline_id = "" + self.pubmed_id = "" + self.remark = "" + + def __str__(self): + """Convert the reference to a GenBank format string.""" + output = self._reference_line() + output += self._authors_line() + output += self._consrtm_line() + output += self._title_line() + output += self._journal_line() + output += self._medline_line() + output += self._pubmed_line() + output += self._remark_line() + + return output + + def _reference_line(self): + """Output for REFERENCE lines (PRIVATE).""" + output = Record.BASE_FORMAT % "REFERENCE" + if self.number: + if self.bases: + output += "%-3s" % self.number + output += "%s" % self.bases + else: + output += "%s" % self.number + + output += "\n" + return output + + def _authors_line(self): + """Output for AUTHORS information (PRIVATE).""" + output = "" + if self.authors: + output += Record.INTERNAL_FORMAT % "AUTHORS" + output += _wrapped_genbank(self.authors, Record.GB_BASE_INDENT) + return output + + def _consrtm_line(self): + """Output for CONSRTM information (PRIVATE).""" + output = "" + if self.consrtm: + output += Record.INTERNAL_FORMAT % "CONSRTM" + output += _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT) + return output + + def _title_line(self): + """Output for TITLE information (PRIVATE).""" + output = "" + if self.title: + output += Record.INTERNAL_FORMAT % "TITLE" + output += _wrapped_genbank(self.title, Record.GB_BASE_INDENT) + return output + + def _journal_line(self): + """Output for JOURNAL information (PRIVATE).""" + output = "" + if self.journal: + output += Record.INTERNAL_FORMAT % "JOURNAL" + output += _wrapped_genbank(self.journal, Record.GB_BASE_INDENT) + return output + + def _medline_line(self): + """Output for MEDLINE information (PRIVATE).""" + output = "" + if 
self.medline_id: + output += Record.INTERNAL_FORMAT % "MEDLINE" + output += self.medline_id + "\n" + return output + + def _pubmed_line(self): + """Output for PUBMED information (PRIVATE).""" + output = "" + if self.pubmed_id: + output += Record.OTHER_INTERNAL_FORMAT % "PUBMED" + output += self.pubmed_id + "\n" + return output + + def _remark_line(self): + """Output for REMARK information (PRIVATE).""" + output = "" + if self.remark: + output += Record.INTERNAL_FORMAT % "REMARK" + output += _wrapped_genbank(self.remark, Record.GB_BASE_INDENT) + return output + + +class Feature: + """Hold information about a Feature in the Feature Table of a GenBank record. + + Attributes: + - key - The key name of the feature (i.e. source) + - location - The string specifying the location of the feature. + - qualifiers - A list of Qualifier objects in the feature. + + """ + + def __init__(self, key="", location=""): + """Initialize the class.""" + self.key = key + self.location = location + self.qualifiers = [] + + def __repr__(self): + """Representation of the object for debugging or logging.""" + return "Feature(key=%r, location=%r)" % (self.key, self.location) + + def __str__(self): + """Return feature as a GenBank format string.""" + output = Record.INTERNAL_FEATURE_FORMAT % self.key + output += _wrapped_genbank( + self.location, Record.GB_FEATURE_INDENT, split_char="," + ) + for qualifier in self.qualifiers: + output += str(qualifier) + return output + + +class Qualifier: + """Hold information about a qualifier in a GenBank feature. + + Attributes: + - key - The key name of the qualifier (i.e. /organism=) + - value - The value of the qualifier ("Dictyostelium discoideum"). + + """ + + def __init__(self, key="", value=""): + """Initialize the class.""" + self.key = key + self.value = value + + def __repr__(self): + """Representation of the object for debugging or logging.""" + return "Qualifier(key=%r, value=%r)" % (self.key, self.value) + + def __str__(self): + """Return feature qualifier as a GenBank format string.""" + output = " " * Record.GB_FEATURE_INDENT + # determine whether we can wrap on spaces + space_wrap = 1 + for no_space_key in Bio.GenBank._BaseGenBankConsumer.remove_space_keys: + if no_space_key in self.key: + space_wrap = 0 + # return double quotes as-is, leave it to the user to escape them + return output + _wrapped_genbank( + self.key + self.value, Record.GB_FEATURE_INDENT, space_wrap + ) diff --git a/code/lib/Bio/GenBank/Scanner.py b/code/lib/Bio/GenBank/Scanner.py new file mode 100644 index 0000000..2d94b4c --- /dev/null +++ b/code/lib/Bio/GenBank/Scanner.py @@ -0,0 +1,1904 @@ +# Copyright 2007-2017 by Peter Cock. All rights reserved. +# Revisions copyright 2010 by Uri Laserson. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Internal code for parsing GenBank and EMBL files (PRIVATE). + +This code is NOT intended for direct use. It provides a basic scanner +(for use with an event consumer such as Bio.GenBank._FeatureConsumer) +to parse a GenBank or EMBL file (with their shared INSDC feature table).
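+ + A minimal illustrative sketch of direct use (the file name is hypothetical, and real code should normally go through Bio.SeqIO instead): + + from Bio.GenBank.Scanner import GenBankScanner + with open("example.gb") as handle: + record = GenBankScanner(debug=0).parse(handle) # returns a SeqRecord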
+ + It is used by Bio.GenBank to parse GenBank files. + It is also used by Bio.SeqIO to parse GenBank and EMBL files. + + Feature Table Documentation: + + - http://www.insdc.org/files/feature_table.html + - http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html + - ftp://ftp.ncbi.nih.gov/genbank/docs/ + """ + # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. + # These are GenBank files that summarize the content of a project, and provide lists of + # scaffold and contig files in the project. These will be in annotations['wgs'] and + # annotations['wgs_scafld']. These GenBank files do not have sequences. See + # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 + # http://is.gd/nNgk + # for more details of this format, and an example. + # Added by Ying Huang & Iddo Friedberg + + +import warnings +import re +import sys +from collections import OrderedDict + +from Bio.File import as_handle +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio import BiopythonParserWarning + + +class InsdcScanner: + """Basic functions for breaking up a GenBank/EMBL file into sub sections. + + The International Nucleotide Sequence Database Collaboration (INSDC) + is a collaboration between the DDBJ, EMBL, and GenBank. These organisations all use the + same "Feature Table" layout in their plain text flat file formats. + + However, the header and sequence sections of an EMBL file are very + different in layout to those produced by GenBank/DDBJ. + """ + + # These constants get redefined with sensible values in the sub classes: + RECORD_START = "XXX" # "LOCUS " or "ID " + HEADER_WIDTH = 3 # 12 or 5 + FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] + FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] + FEATURE_QUALIFIER_INDENT = 0 + FEATURE_QUALIFIER_SPACER = "" + SEQUENCE_HEADERS = ["XXX"] # with right hand side spaces removed + + def __init__(self, debug=0): + """Initialize the class.""" + assert len(self.RECORD_START) == self.HEADER_WIDTH + for marker in self.SEQUENCE_HEADERS: + assert marker == marker.rstrip() + assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT + self.debug = debug + self.handle = None + self.line = None + + def set_handle(self, handle): + """Set the handle attribute.""" + self.handle = handle + self.line = "" + + def find_start(self): + """Read in lines until we find the ID/LOCUS line, which is returned. + + Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives) + will be ignored. + """ + while True: + if self.line: + line = self.line + self.line = "" + else: + line = self.handle.readline() + if not line: + if self.debug: + print("End of file") + return None + if isinstance(line[0], int): + # Same exception as for FASTQ files + raise ValueError("Is this handle in binary mode not text mode?") + if line[: self.HEADER_WIDTH] == self.RECORD_START: + if self.debug > 1: + print("Found the start of a record:\n" + line) + break + line = line.rstrip() + if line == "//": + if self.debug > 1: + print("Skipping // marking end of last record") + elif line == "": + if self.debug > 1: + print("Skipping blank line before record") + else: + # Ignore any header before the first ID/LOCUS line. + if self.debug > 1: + print("Skipping header line before record:\n" + line) + self.line = line + return line + + def parse_header(self): + """Return list of strings making up the header. + + New line characters are removed. + + Assumes you have just read in the ID/LOCUS line.
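+ + Note the ID/LOCUS line itself is not included in the returned list; it is handled separately via _feed_first_line(), so for a GenBank file the result is e.g. ['DEFINITION ...', 'ACCESSION ...', ...] (illustrative values only).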
+ """ + if self.line[: self.HEADER_WIDTH] != self.RECORD_START: + raise ValueError("Not at start of record") + + header_lines = [] + while True: + line = self.handle.readline() + if not line: + raise ValueError("Premature end of line during sequence data") + line = line.rstrip() + if line in self.FEATURE_START_MARKERS: + if self.debug: + print("Found feature table") + break + # if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]: + # if self.debug : print("Found header table (?)") + # break + if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: + if self.debug: + print("Found start of sequence") + break + if line == "//": + raise ValueError("Premature end of sequence data marker '//' found") + header_lines.append(line) + self.line = line + return header_lines + + def parse_features(self, skip=False): + """Return list of tuples for the features (if present). + + Each feature is returned as a tuple (key, location, qualifiers) + where key and location are strings (e.g. "CDS" and + "complement(join(490883..490885,1..879))") while qualifiers + is a list of two string tuples (feature qualifier keys and values). + + Assumes you have already read to the start of the features table. + """ + if self.line.rstrip() not in self.FEATURE_START_MARKERS: + if self.debug: + print("Didn't find any feature table") + return [] + + while self.line.rstrip() in self.FEATURE_START_MARKERS: + self.line = self.handle.readline() + + features = [] + line = self.line + while True: + if not line: + raise ValueError("Premature end of line during features table") + if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: + if self.debug: + print("Found start of sequence") + break + line = line.rstrip() + if line == "//": + raise ValueError("Premature end of features table, marker '//' found") + if line in self.FEATURE_END_MARKERS: + if self.debug: + print("Found end of features") + line = self.handle.readline() + break + if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "": + # This is an empty feature line between qualifiers. Empty + # feature lines within qualifiers are handled below (ignored). + line = self.handle.readline() + continue + if len(line) < self.FEATURE_QUALIFIER_INDENT: + warnings.warn( + "line too short to contain a feature: %r" % line, + BiopythonParserWarning, + ) + line = self.handle.readline() + continue + + if skip: + line = self.handle.readline() + while ( + line[: self.FEATURE_QUALIFIER_INDENT] + == self.FEATURE_QUALIFIER_SPACER + ): + line = self.handle.readline() + else: + # Build up a list of the lines making up this feature: + if ( + line[self.FEATURE_QUALIFIER_INDENT] != " " + and " " in line[self.FEATURE_QUALIFIER_INDENT :] + ): + # The feature table design enforces a length limit on the feature keys. + # Some third party files (e.g. IGMT's EMBL like files) solve this by + # over indenting the location and qualifiers. + feature_key, line = line[2:].strip().split(None, 1) + feature_lines = [line] + warnings.warn( + "Over indented %s feature?" % feature_key, + BiopythonParserWarning, + ) + else: + feature_key = line[2 : self.FEATURE_QUALIFIER_INDENT].strip() + feature_lines = [line[self.FEATURE_QUALIFIER_INDENT :]] + line = self.handle.readline() + while line[ + : self.FEATURE_QUALIFIER_INDENT + ] == self.FEATURE_QUALIFIER_SPACER or ( + line != "" and line.rstrip() == "" + ): # cope with blank lines in the midst of a feature + # Use strip to remove any harmless trailing white space AND and leading + # white space (e.g. 
out of spec files with too much indentation) + feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip()) + line = self.handle.readline() + features.append(self.parse_feature(feature_key, feature_lines)) + self.line = line + return features + + def parse_feature(self, feature_key, lines): + r"""Parse a feature given as a list of strings into a tuple. + + Expects a feature as a list of strings, returns a tuple (key, location, + qualifiers) + + For example given this GenBank feature:: + + CDS complement(join(490883..490885,1..879)) + /locus_tag="NEQ001" + /note="conserved hypothetical [Methanococcus jannaschii]; + COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear + localization signal; IPR002743: Protein of unknown + function DUF57" + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="NP_963295.1" + /db_xref="GI:41614797" + /db_xref="GeneID:2732620" + /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK + EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK + KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP + IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE + EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS + LNSMGFGFVNTKKNSAR" + + Then should give input key="CDS" and the rest of the data as a list of strings + lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] + where the leading spaces and trailing newlines have been removed. + + Returns tuple containing: (key as string, location string, qualifiers as list) + as follows for this example: + + key = "CDS", string + location = "complement(join(490883..490885,1..879))", string + qualifiers = list of string tuples: + + [('locus_tag', '"NEQ001"'), + ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), + ('codon_start', '1'), + ('transl_table', '11'), + ('product', '"hypothetical protein"'), + ('protein_id', '"NP_963295.1"'), + ('db_xref', '"GI:41614797"'), + ('db_xref', '"GeneID:2732620"'), + ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] + + In the above example, the "note" and "translation" were edited for compactness, + and they would contain multiple new line characters (displayed above as \n) + + If a qualifier is quoted (in this case, everything except codon_start and + transl_table) then the quotes are NOT removed. + + Note that no whitespace is removed. + """ + # Skip any blank lines + iterator = (x for x in lines if x) + try: + line = next(iterator) + + feature_location = line.strip() + while feature_location[-1:] == ",": + # Multiline location, still more to come! 
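+ # e.g. a location such as "join(complement(123..456)," wrapped after + # the comma, with the remainder on the following line(s):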
+ line = next(iterator) + feature_location += line.strip() + if feature_location.count("(") > feature_location.count(")"): + # Including the prev line in warning would be more explicit, + # but this way get one-and-only-one warning shown by default: + warnings.warn( + "Non-standard feature line wrapping (didn't break on comma)?", + BiopythonParserWarning, + ) + while feature_location[-1:] == "," or feature_location.count( + "(" + ) > feature_location.count(")"): + line = next(iterator) + feature_location += line.strip() + + qualifiers = [] + + for line_number, line in enumerate(iterator): + # check for extra wrapping of the location closing parentheses + if line_number == 0 and line.startswith(")"): + feature_location += line.strip() + elif line[0] == "/": + # New qualifier + i = line.find("=") + key = line[1:i] # does not work if i==-1 + value = line[i + 1 :] # we ignore 'value' if i==-1 + if i and value.startswith(" ") and value.lstrip().startswith('"'): + warnings.warn( + "White space after equals in qualifier", + BiopythonParserWarning, + ) + value = value.lstrip() + if i == -1: + # Qualifier with no key, e.g. /pseudo + key = line[1:] + qualifiers.append((key, None)) + elif not value: + # ApE can output /note= + qualifiers.append((key, "")) + elif value == '"': + # One single quote + if self.debug: + print("Single quote %s:%s" % (key, value)) + # DO NOT remove the quote... + qualifiers.append((key, value)) + elif value[0] == '"': + # Quoted... + value_list = [value] + while value_list[-1][-1] != '"': + value_list.append(next(iterator)) + value = "\n".join(value_list) + # DO NOT remove the quotes... + qualifiers.append((key, value)) + else: + # Unquoted + # if debug : print("Unquoted line %s:%s" % (key,value)) + qualifiers.append((key, value)) + else: + # Unquoted continuation + assert len(qualifiers) > 0 + assert key == qualifiers[-1][0] + # if debug : print("Unquoted Cont %s:%s" % (key, line)) + if qualifiers[-1][1] is None: + raise StopIteration + qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) + return feature_key, feature_location, qualifiers + except StopIteration: + # Bummer + raise ValueError( + "Problem with '%s' feature:\n%s" % (feature_key, "\n".join(lines)) + ) from None + + def parse_footer(self): + """Return a tuple containing a list of any misc strings, and the sequence.""" + # This is a basic bit of code to scan and discard the sequence, + # which was useful when developing the sub classes. + if self.line in self.FEATURE_END_MARKERS: + while self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS: + self.line = self.handle.readline() + if not self.line: + raise ValueError("Premature end of file") + self.line = self.line.rstrip() + + if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS: + raise ValueError("Not at start of sequence") + while True: + line = self.handle.readline() + if not line: + raise ValueError("Premature end of line during sequence data") + line = line.rstrip() + if line == "//": + break + self.line = line + return [], "" # Dummy values! + + def _feed_first_line(self, consumer, line): + """Handle the LOCUS/ID line, passing data to the consumer (PRIVATE). + + This should be implemented by the EMBL / GenBank specific subclass + + Used by the parse_records() and parse() methods. + """ + pass + + def _feed_header_lines(self, consumer, lines): + """Handle the header lines (list of strings), passing data to the consumer (PRIVATE).
+ + This should be implemented by the EMBL / GenBank specific subclass + + Used by the parse_records() and parse() methods. + """ + pass + + @staticmethod + def _feed_feature_table(consumer, feature_tuples): + """Handle the feature table (list of tuples), passing data to the consumer (PRIVATE). + + Used by the parse_records() and parse() methods. + """ + consumer.start_feature_table() + for feature_key, location_string, qualifiers in feature_tuples: + consumer.feature_key(feature_key) + consumer.location(location_string) + for q_key, q_value in qualifiers: + if q_value is None: + consumer.feature_qualifier(q_key, q_value) + else: + consumer.feature_qualifier(q_key, q_value.replace("\n", " ")) + + def _feed_misc_lines(self, consumer, lines): + """Handle any lines between features and sequence (list of strings), passing data to the consumer (PRIVATE). + + This should be implemented by the EMBL / GenBank specific subclass + + Used by the parse_records() and parse() methods. + """ + pass + + def feed(self, handle, consumer, do_features=True): + """Feed a set of data into the consumer. + + This method is intended for use with the "old" code in Bio.GenBank + + Arguments: + - handle - A handle with the information to parse. + - consumer - The consumer that should be informed of events. + - do_features - Boolean, should the features be parsed? + Skipping the features can be much faster. + + Return values: + - true - Passed a record + - false - Did not find a record + + """ + # Should work with both EMBL and GenBank files provided the + # equivalent Bio.GenBank._FeatureConsumer methods are called... + self.set_handle(handle) + if not self.find_start(): + # Could not find (another) record + consumer.data = None + return False + + # We use the above class methods to parse the file into a simplified format. + # The first line, header lines and any misc lines after the features will be + # dealt with by GenBank / EMBL specific derived classes. + + # First line and header: + self._feed_first_line(consumer, self.line) + self._feed_header_lines(consumer, self.parse_header()) + + # Features (common to both EMBL and GenBank): + if do_features: + self._feed_feature_table(consumer, self.parse_features(skip=False)) + else: + self.parse_features(skip=True) # ignore the data + + # Footer and sequence + misc_lines, sequence_string = self.parse_footer() + self._feed_misc_lines(consumer, misc_lines) + + consumer.sequence(sequence_string) + # Calls to consumer.base_number() do nothing anyway + consumer.record_end("//") + + assert self.line == "//" + + # And we are done + return True + + def parse(self, handle, do_features=True): + """Return a SeqRecord (with SeqFeatures if do_features=True). + + See also the method parse_records() for use on multi-record files. + """ + from Bio.GenBank import _FeatureConsumer + from Bio.GenBank.utils import FeatureValueCleaner + + consumer = _FeatureConsumer( + use_fuzziness=1, feature_cleaner=FeatureValueCleaner() + ) + + if self.feed(handle, consumer, do_features): + return consumer.data + else: + return None + + def parse_records(self, handle, do_features=True): + """Parse records, return a SeqRecord object iterator.
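+ + A minimal usage sketch (hypothetical file name; in practice this is reached via Bio.SeqIO with the "genbank" or "embl" formats): + + with open("example.gbk") as handle: + for record in GenBankScanner().parse_records(handle): + print(record.id)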
+ + Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord + + The SeqRecord objects include SeqFeatures if do_features=True + + This method is intended for use in Bio.SeqIO + """ + # This is a generator function + with as_handle(handle) as handle: + while True: + record = self.parse(handle, do_features) + if record is None: + break + if record.id is None: + raise ValueError( + "Failed to parse the record's ID. Invalid ID line?" + ) + if record.name == "": + raise ValueError( + "Failed to parse the record's name. Invalid ID line?" + ) + if record.description == "": + raise ValueError("Failed to parse the record's description") + yield record + + def parse_cds_features( + self, handle, alphabet=None, tags2id=("protein_id", "locus_tag", "product"), + ): + """Parse CDS features, return SeqRecord object iterator. + + Each CDS feature becomes a SeqRecord. + + Arguments: + - alphabet - Obsolete, should be left as None. + - tags2id - Tuple of three strings, the feature keys to use + for the record id, name and description. + + This method is intended for use in Bio.SeqIO + + """ + if alphabet is not None: + raise ValueError("The alphabet argument is no longer supported") + with as_handle(handle) as handle: + self.set_handle(handle) + while self.find_start(): + # Got an EMBL or GenBank record... + self.parse_header() # ignore header lines! + feature_tuples = self.parse_features() + # self.parse_footer() # ignore footer lines! + while True: + line = self.handle.readline() + if not line: + break + if line[:2] == "//": + break + self.line = line.rstrip() + + # Now go through those features... + for key, location_string, qualifiers in feature_tuples: + if key == "CDS": + # Create SeqRecord + # ================ + # SeqRecord objects cannot be created with annotations, they + # must be added afterwards. So create an empty record and + # then populate it: + record = SeqRecord(seq=None) + annotations = record.annotations + annotations["molecule_type"] = "protein" + # Should we add a location object to the annotations? + # I *think* that only makes sense for SeqFeatures with their + # sub features... + annotations["raw_location"] = location_string.replace(" ", "") + + for (qualifier_name, qualifier_data) in qualifiers: + if ( + qualifier_data is not None + and qualifier_data[0] == '"' + and qualifier_data[-1] == '"' + ): + # Remove quotes + qualifier_data = qualifier_data[1:-1] + # Append the data to the annotation qualifier... + if qualifier_name == "translation": + assert record.seq is None, "Multiple translations!" + record.seq = Seq(qualifier_data.replace("\n", "")) + elif qualifier_name == "db_xref": + # it's a list, possibly empty.
It's safe to extend + record.dbxrefs.append(qualifier_data) + else: + if qualifier_data is not None: + qualifier_data = qualifier_data.replace( + "\n", " " + ).replace(" ", " ") + try: + annotations[qualifier_name] += " " + qualifier_data + except KeyError: + # Not an addition to existing data, it's the first bit + annotations[qualifier_name] = qualifier_data + + # Fill in the ID, Name, Description + # ================================= + try: + record.id = annotations[tags2id[0]] + except KeyError: + pass + try: + record.name = annotations[tags2id[1]] + except KeyError: + pass + try: + record.description = annotations[tags2id[2]] + except KeyError: + pass + + yield record + + +class EmblScanner(InsdcScanner): + """For extracting chunks of information in EMBL files.""" + + RECORD_START = "ID " + HEADER_WIDTH = 5 + FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", "FH"] + FEATURE_END_MARKERS = ["XX"] # XX can also mark the end of many things! + FEATURE_QUALIFIER_INDENT = 21 + FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2) + SEQUENCE_HEADERS = ["SQ", "CO"] # Remove trailing spaces + + EMBL_INDENT = HEADER_WIDTH + EMBL_SPACER = " " * EMBL_INDENT + + def parse_footer(self): + """Return a tuple containing a list of any misc strings, and the sequence.""" + if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS: + raise ValueError("Footer format unexpected: '%s'" % self.line) + + # Note that the SQ line can be split into several lines... + misc_lines = [] + while self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: + misc_lines.append(self.line) + self.line = self.handle.readline() + if not self.line: + raise ValueError("Premature end of file") + self.line = self.line.rstrip() + + if not ( + self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH + or self.line.strip() == "//" + ): + raise ValueError("Unexpected content after SQ or CO line: %r" % self.line) + + seq_lines = [] + line = self.line + while True: + if not line: + raise ValueError("Premature end of file in sequence data") + line = line.strip() + if not line: + raise ValueError("Blank line in sequence data") + if line == "//": + break + if self.line[: self.HEADER_WIDTH] != (" " * self.HEADER_WIDTH): + raise ValueError( + "Problem with characters in header line, " + " or incorrect header width: " + self.line + ) + # Remove trailing number now, remove spaces later + linersplit = line.rsplit(None, 1) + if len(linersplit) == 2 and linersplit[1].isdigit(): + seq_lines.append(linersplit[0]) + elif line.isdigit(): + # Special case of final blank line with no bases + # just the sequence coordinate + pass + else: + warnings.warn( + "EMBL sequence line missing coordinates", BiopythonParserWarning + ) + seq_lines.append(line) + line = self.handle.readline() + self.line = line + return misc_lines, "".join(seq_lines).replace(" ", "") + + def _feed_first_line(self, consumer, line): + assert line[: self.HEADER_WIDTH].rstrip() == "ID" + if line[self.HEADER_WIDTH :].count(";") == 6: + # Looks like the semi colon separated style introduced in 2006 + self._feed_first_line_new(consumer, line) + elif line[self.HEADER_WIDTH :].count(";") == 3: + if line.rstrip().endswith(" SQ"): + # EMBL-bank patent data + self._feed_first_line_patents(consumer, line) + else: + # Looks like the pre 2006 style + self._feed_first_line_old(consumer, line) + elif line[self.HEADER_WIDTH :].count(";") == 2: + # Looks like KIPO patent data + self._feed_first_line_patents_kipo(consumer, line) + else: + raise ValueError("Did
not recognise the ID line layout:\n" + line) + + def _feed_first_line_patents(self, consumer, line): + # Old style EMBL patent records where ID line ended SQ + # Not 100% sure that PRT here is really molecule type and + # not the data file division... + # + # Either Non-Redundant Level 1 database records, + # ID ; ; ; + # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ + # + # Or, Non-Redundant Level 2 database records: + # ID ; ; ; + # e.g. ID NRP0000016E; PRT; NR2; 5 SQ + # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ + fields = [ + data.strip() for data in line[self.HEADER_WIDTH :].strip()[:-3].split(";") + ] + assert len(fields) == 4 + consumer.locus(fields[0]) + consumer.residue_type(fields[1]) # semi-redundant + consumer.data_file_division(fields[2]) + # TODO - Record cluster size? + + def _feed_first_line_patents_kipo(self, consumer, line): + # EMBL format patent sequence from KIPO, e.g. + # ftp://ftp.ebi.ac.uk/pub/databases/patentdata/kipo_prt.dat.gz + # + # e.g. ID DI500001 STANDARD; PRT; 111 AA. + # + # This follows the style of _feed_first_line_old + assert line[: self.HEADER_WIDTH].rstrip() == "ID" + fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]] + fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";")) + fields = [entry.strip() for entry in fields] + """ + The tokens represent: + + 0. Primary accession number + (space sep) + 1. ??? (e.g. standard) + (semi-colon) + 2. Molecule type (protein)? Division? Always 'PRT' + 3. Sequence length (e.g. '111 AA.') + """ + consumer.locus(fields[0]) # Should we also call the accession consumer? + # consumer.molecule_type(fields[2]) + self._feed_seq_length(consumer, fields[3]) + + def _feed_first_line_old(self, consumer, line): + # Expects an ID line in the style before 2006, e.g. + # ID SC10H5 standard; DNA; PRO; 4870 BP. + # ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. + assert line[: self.HEADER_WIDTH].rstrip() == "ID" + fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]] + fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";")) + fields = [entry.strip() for entry in fields] + """ + The tokens represent: + + 0. Primary accession number + (space sep) + 1. ??? (e.g. standard) + (semi-colon) + 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') + 3. Taxonomic division (e.g. 'PRO') + 4. Sequence length (e.g. '4639675 BP.') + + """ + consumer.locus(fields[0]) # Should we also call the accession consumer? + consumer.residue_type(fields[2]) + if "circular" in fields[2]: + consumer.topology("circular") + consumer.molecule_type(fields[2].replace("circular", "").strip()) + elif "linear" in fields[2]: + consumer.topology("linear") + consumer.molecule_type(fields[2].replace("linear", "").strip()) + else: + consumer.molecule_type(fields[2].strip()) + consumer.data_file_division(fields[3]) + self._feed_seq_length(consumer, fields[4]) + + def _feed_first_line_new(self, consumer, line): + # Expects an ID line in the style introduced in 2006, e.g. + # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. + # ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. + assert line[: self.HEADER_WIDTH].rstrip() == "ID" + fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")] + assert len(fields) == 7 + """ + The tokens represent: + + 0. Primary accession number + 1. Sequence version number + 2. Topology: 'circular' or 'linear' + 3. Molecule type (e.g. 'genomic DNA') + 4. Data class (e.g. 'STD') + 5. Taxonomic division (e.g. 'PRO') + 6. Sequence length (e.g. 
'4639675 BP.') + + """ + + consumer.locus(fields[0]) + + # Call the accession consumer now, to make sure we record + # something as the record.id, in case there is no AC line + consumer.accession(fields[0]) + + # TODO - How to deal with the version field? At the moment the consumer + # will try and use this for the ID which isn't ideal for EMBL files. + version_parts = fields[1].split() + if ( + len(version_parts) == 2 + and version_parts[0] == "SV" + and version_parts[1].isdigit() + ): + consumer.version_suffix(version_parts[1]) + + # Based on how the old GenBank parser worked, merge these two: + consumer.residue_type(" ".join(fields[2:4])) # Semi-obsolete + + consumer.topology(fields[2]) + consumer.molecule_type(fields[3]) + + # consumer.xxx(fields[4]) # TODO - What should we do with the data class? + + consumer.data_file_division(fields[5]) + + self._feed_seq_length(consumer, fields[6]) + + @staticmethod + def _feed_seq_length(consumer, text): + length_parts = text.split() + assert len(length_parts) == 2, "Invalid sequence length string %r" % text + assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."] + consumer.size(length_parts[0]) + + def _feed_header_lines(self, consumer, lines): + consumer_dict = { + "AC": "accession", + "SV": "version", # SV line removed in June 2006, now part of ID line + "DE": "definition", + # 'RN' : 'reference_num', + # 'RC' : reference comment... TODO + # 'RP' : 'reference_bases', + # 'RX' : reference cross reference... DOI or Pubmed + "RG": "consrtm", # optional consortium + # 'RA' : 'authors', + # 'RT' : 'title', + "RL": "journal", + "OS": "organism", + "OC": "taxonomy", + # 'DR' : data reference + "CC": "comment", + # 'XX' : splitter + } + # We have to handle the following specially: + # RX (depending on reference type...) + for line in lines: + line_type = line[: self.EMBL_INDENT].strip() + data = line[self.EMBL_INDENT :].strip() + if line_type == "XX": + pass + elif line_type == "RN": + # Reformat reference numbers for the GenBank based consumer + # e.g. '[1]' becomes '1' + if data[0] == "[" and data[-1] == "]": + data = data[1:-1] + consumer.reference_num(data) + elif line_type == "RP": + if data.strip() == "[-]": + # Patent EMBL files from KIPO just use: RN [-] + pass + else: + # Reformat reference numbers for the GenBank based consumer + # e.g. '1-4639675' becomes '(bases 1 to 4639675)' + # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)' + # Note could be multi-line, and end with a comma + parts = [ + bases.replace("-", " to ").strip() + for bases in data.split(",") + if bases.strip() + ] + consumer.reference_bases("(bases %s)" % "; ".join(parts)) + elif line_type == "RT": + # Remove the enclosing quotes and trailing semi colon. + # Note the title can be split over multiple lines. + if data.startswith('"'): + data = data[1:] + if data.endswith('";'): + data = data[:-2] + consumer.title(data) + elif line_type == "RX": + # EMBL support three reference types at the moment: + # - PUBMED PUBMED bibliographic database (NLM) + # - DOI Digital Object Identifier (International DOI Foundation) + # - AGRICOLA US National Agriculture Library (NAL) of the US Department + # of Agriculture (USDA) + # + # Format: + # RX resource_identifier; identifier. + # + # e.g. + # RX DOI; 10.1016/0024-3205(83)90010-3. + # RX PUBMED; 264242. + # + # Currently our reference object only supports PUBMED and MEDLINE + # (as these were in GenBank files?). 
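+ # e.g. for 'RX PUBMED; 264242.' the data string here is + # 'PUBMED; 264242.', so key becomes 'PUBMED' and value becomes + # '264242' after the split and strip below.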
+ key, value = data.split(";", 1) + if value.endswith("."): + value = value[:-1] + value = value.strip() + if key == "PUBMED": + consumer.pubmed_id(value) + # TODO - Handle other reference types (here and in BioSQL bindings) + elif line_type == "CC": + # Have to pass a list of strings for this one (not just a string) + consumer.comment([data]) + elif line_type == "DR": + # Database Cross-reference, format: + # DR database_identifier; primary_identifier; secondary_identifier. + # + # e.g. + # DR MGI; 98599; Tcrb-V4. + # + # TODO - How should we store any secondary identifier? + parts = data.rstrip(".").split(";") + # Turn it into "database_identifier:primary_identifier" to + # mimic the GenBank parser. e.g. "MGI:98599" + if len(parts) == 1: + warnings.warn( + "Malformed DR line in EMBL file.", BiopythonParserWarning + ) + else: + consumer.dblink("%s:%s" % (parts[0].strip(), parts[1].strip())) + elif line_type == "RA": + # Remove trailing ; at end of authors list + consumer.authors(data.rstrip(";")) + elif line_type == "PR": + # In the EMBL patent files, this is a PR (PRiority) line which + # provides the earliest active priority within the family. + # The priority number comes first, followed by the priority date. + # + # e.g. + # PR JP19990377484 16-DEC-1999 + # + # However, in most EMBL files this is a PR (PRoject) line which + # gives the BioProject reference number. + # + # e.g. + # PR Project:PRJNA60715; + # + # In GenBank files this corresponds to the old PROJECT line + # which was later replaced with the DBLINK line. + if data.startswith("Project:"): + # Remove trailing ; at end of the project reference + consumer.project(data.rstrip(";")) + elif line_type == "KW": + consumer.keywords(data.rstrip(";")) + elif line_type in consumer_dict: + # It's a semi-automatic entry! + getattr(consumer, consumer_dict[line_type])(data) + else: + if self.debug: + print("Ignoring EMBL header line:\n%s" % line) + + def _feed_misc_lines(self, consumer, lines): + # TODO - Should we do something with the information on the SQ line(s)? + lines.append("") + line_iter = iter(lines) + try: + for line in line_iter: + if line.startswith("CO "): + line = line[5:].strip() + contig_location = line + while True: + line = next(line_iter) + if not line: + break + elif line.startswith("CO "): + # Don't need to preserve the whitespace here. + contig_location += line[5:].strip() + else: + raise ValueError( + "Expected CO (contig) continuation line, got:\n" + line + ) + consumer.contig_location(contig_location) + if line.startswith("SQ Sequence "): + # e.g. + # SQ Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other; + # + # Or, EMBL-bank patent, e.g. + # SQ Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5; + self._feed_seq_length( + consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0] + ) + # TODO - Record the checksum etc? + return + except StopIteration: + raise ValueError("Problem in misc lines before sequence") from None + + +class _ImgtScanner(EmblScanner): + """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE). + + IMGT files are like EMBL files but in order to allow longer feature types + the features should be indented by 25 characters not 21 characters. In + practice the IMGT flat files tend to use either 21 or 25 characters, so we + must cope with both. + + This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
+ """ + + FEATURE_START_MARKERS = [ + "FH Key Location/Qualifiers", + "FH Key Location/Qualifiers (from EMBL)", + "FH Key Location/Qualifiers", + "FH", + ] + + def _feed_first_line(self, consumer, line): + assert line[: self.HEADER_WIDTH].rstrip() == "ID" + if line[self.HEADER_WIDTH :].count(";") != 5: + # Assume its an older EMBL-like line, + return EmblScanner._feed_first_line(self, consumer, line) + # Otherwise assume its the new (circa 2016) IMGT style + # as used in the IPD-IMGT/HLA Database + # + # https://github.com/ANHIG/IMGTHLA/ + # + # The key changes post 3.16 are the addition of an SV value + # to the ID line, these additions should make the format more + # similar to the ENA style. + # + # ID HLA00001 standard; DNA; HUM; 3503 BP. + # + # becomes + # + # ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP. + fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")] + assert len(fields) == 6 + """ + The tokens represent: + + 0. Primary accession number (eg 'HLA00001') + 1. Sequence version number (eg 'SV 1') + 2. ??? eg 'standard' + 3. Molecule type (e.g. 'DNA') + 4. Taxonomic division (e.g. 'HUM') + 5. Sequence length (e.g. '3503 BP.') + """ + consumer.locus(fields[0]) + + # See TODO on the EMBL _feed_first_line_new about version field + version_parts = fields[1].split() + if ( + len(version_parts) == 2 + and version_parts[0] == "SV" + and version_parts[1].isdigit() + ): + consumer.version_suffix(version_parts[1]) + + consumer.residue_type(fields[3]) + if "circular" in fields[3]: + consumer.topology("circular") + consumer.molecule_type(fields[3].replace("circular", "").strip()) + elif "linear" in fields[3]: + consumer.topology("linear") + consumer.molecule_type(fields[3].replace("linear", "").strip()) + else: + consumer.molecule_type(fields[3].strip()) + consumer.data_file_division(fields[4]) + self._feed_seq_length(consumer, fields[5]) + + def parse_features(self, skip=False): + """Return list of tuples for the features (if present). + + Each feature is returned as a tuple (key, location, qualifiers) + where key and location are strings (e.g. "CDS" and + "complement(join(490883..490885,1..879))") while qualifiers + is a list of two string tuples (feature qualifier keys and values). + + Assumes you have already read to the start of the features table. + """ + if self.line.rstrip() not in self.FEATURE_START_MARKERS: + if self.debug: + print("Didn't find any feature table") + return [] + + while self.line.rstrip() in self.FEATURE_START_MARKERS: + self.line = self.handle.readline() + + bad_position_re = re.compile(r"([0-9]+)>") + + features = [] + line = self.line + while True: + if not line: + raise ValueError("Premature end of line during features table") + if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: + if self.debug: + print("Found start of sequence") + break + line = line.rstrip() + if line == "//": + raise ValueError("Premature end of features table, marker '//' found") + if line in self.FEATURE_END_MARKERS: + if self.debug: + print("Found end of features") + line = self.handle.readline() + break + if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "": + # This is an empty feature line between qualifiers. Empty + # feature lines within qualifiers are handled below (ignored). 
+ line = self.handle.readline() + continue + + if skip: + line = self.handle.readline() + while ( + line[: self.FEATURE_QUALIFIER_INDENT] + == self.FEATURE_QUALIFIER_SPACER + ): + line = self.handle.readline() + else: + assert line[:2] == "FT" + try: + feature_key, location_start = line[2:].strip().split() + except ValueError: + # e.g. "FT TRANSMEMBRANE-REGION2163..2240\n" + # Assume indent of 25 as per IMGT spec, with the location + # start in column 26 (one-based). + feature_key = line[2:25].strip() + location_start = line[25:].strip() + feature_lines = [location_start] + line = self.handle.readline() + while ( + line[: self.FEATURE_QUALIFIER_INDENT] + == self.FEATURE_QUALIFIER_SPACER + or line.rstrip() == "" + ): # cope with blank lines in the midst of a feature + # Use strip to remove any harmless trailing white space AND any leading + # white space (copes with 21 or 26 indents and other variants) + assert line[:2] == "FT" + feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip()) + line = self.handle.readline() + feature_key, location, qualifiers = self.parse_feature( + feature_key, feature_lines + ) + # Try to handle known problems with IMGT locations here: + if ">" in location: + # Nasty hack for common IMGT bug, should be >123 not 123> + # in a location string. At least here the meaning is clear, + # and since it is so common I don't want to issue a warning + # warnings.warn("Feature location %s is invalid, " + # "moving greater than sign before position" + # % location, BiopythonParserWarning) + location = bad_position_re.sub(r">\1", location) + features.append((feature_key, location, qualifiers)) + self.line = line + return features + + +class GenBankScanner(InsdcScanner): + """For extracting chunks of information in GenBank files.""" + + RECORD_START = "LOCUS " + HEADER_WIDTH = 12 + FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"] + FEATURE_END_MARKERS = [] + FEATURE_QUALIFIER_INDENT = 21 + FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT + SEQUENCE_HEADERS = [ + "CONTIG", + "ORIGIN", + "BASE COUNT", + "WGS", + "TSA", + "TLS", + ] # trailing spaces removed + + GENBANK_INDENT = HEADER_WIDTH + GENBANK_SPACER = " " * GENBANK_INDENT + + STRUCTURED_COMMENT_START = "-START##" + STRUCTURED_COMMENT_END = "-END##" + STRUCTURED_COMMENT_DELIM = " :: " + + def parse_footer(self):
 """Return a tuple containing a list of any misc strings, and the sequence.""" + if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS: + raise ValueError("Footer format unexpected: '%s'" % self.line) + + misc_lines = [] + while ( + self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS + or self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH + or "WGS" == self.line[:3] + ): + misc_lines.append(self.line.rstrip()) + self.line = self.handle.readline() + if not self.line: + raise ValueError("Premature end of file") + + if self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: + raise ValueError("Eh?
'%s'" % self.line) + + # Now just consume the sequence lines until reach the // marker + # or a CONTIG line + seq_lines = [] + line = self.line + while True: + if not line: + warnings.warn( + "Premature end of file in sequence data", BiopythonParserWarning + ) + line = "//" + break + line = line.rstrip() + if not line: + warnings.warn("Blank line in sequence data", BiopythonParserWarning) + line = self.handle.readline() + continue + if line == "//": + break + if line.startswith("CONTIG"): + break + if len(line) > 9 and line[9:10] != " ": + # Some broken programs indent the sequence by one space too many + # so try to get rid of that and test again. + warnings.warn( + "Invalid indentation for sequence line", BiopythonParserWarning + ) + line = line[1:] + if len(line) > 9 and line[9:10] != " ": + raise ValueError("Sequence line mal-formed, '%s'" % line) + seq_lines.append(line[10:]) # remove spaces later + line = self.handle.readline() + + self.line = line + return misc_lines, "".join(seq_lines).replace(" ", "") + + def _feed_first_line(self, consumer, line): + """Scan over and parse GenBank LOCUS line (PRIVATE). + + This must cope with several variants, primarily the old and new column + based standards from GenBank. Additionally EnsEMBL produces GenBank + files where the LOCUS line is space separated rather that following + the column based layout. + + We also try to cope with GenBank like files with partial LOCUS lines. + + As of release 229.0, the columns are no longer strictly in a given + position. See GenBank format release notes: + + "Historically, the LOCUS line has had a fixed length and its + elements have been presented at specific column positions... + But with the anticipated increases in the lengths of accession + numbers, and the advent of sequences that are gigabases long, + maintaining the column positions will not always be possible and + the overall length of the LOCUS line could exceed 79 characters." + + """ + ##################################### + # LOCUS line # + ##################################### + if line[0 : self.GENBANK_INDENT] != "LOCUS ": + raise ValueError("LOCUS line does not start correctly:\n" + line) + + # Have to break up the locus line, and handle the different bits of it. + # There are at least two different versions of the locus line... + if line[29:33] in [" bp ", " aa ", " rc "] and line[55:62] == " ": + # Old... note we insist on the 55:62 being empty to avoid trying + # to parse space separated LOCUS lines from Ensembl etc, see below. + # + # Positions Contents + # --------- -------- + # 00:06 LOCUS + # 06:12 spaces + # 12:?? Locus name + # ??:?? space + # ??:29 Length of sequence, right-justified + # 29:33 space, bp, space + # 33:41 strand type / molecule type, e.g. DNA + # 41:42 space + # 42:51 Blank (implies linear), linear or circular + # 51:52 space + # 52:55 The division code (e.g. 
BCT, VRL, INV) + # 55:62 space + # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) + # + # assert line[29:33] in [' bp ', ' aa ',' rc '] , \ + # 'LOCUS line does not contain size units at expected position:\n' + line + if line[41:42] != " ": + raise ValueError( + "LOCUS line does not contain space at position 42:\n" + line + ) + if line[42:51].strip() not in ["", "linear", "circular"]: + raise ValueError( + "LOCUS line does not contain valid entry " + "(linear, circular, ...):\n" + line + ) + if line[51:52] != " ": + raise ValueError( + "LOCUS line does not contain space at position 52:\n" + line + ) + # if line[55:62] != ' ': + # raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line) + if line[62:73].strip(): + if line[64:65] != "-": + raise ValueError( + "LOCUS line does not contain - at " + "position 65 in date:\n" + line + ) + if line[68:69] != "-": + raise ValueError( + "LOCUS line does not contain - at " + "position 69 in date:\n" + line + ) + + name_and_length_str = line[self.GENBANK_INDENT : 29] + while " " in name_and_length_str: + name_and_length_str = name_and_length_str.replace(" ", " ") + name_and_length = name_and_length_str.split(" ") + if len(name_and_length) > 2: + raise ValueError( + "Cannot parse the name and length in the LOCUS line:\n" + line + ) + if len(name_and_length) == 1: + raise ValueError("Name and length collide in the LOCUS line:\n" + line) + # Should be possible to split them based on position, if + # a clear definition of the standard exists THAT AGREES with + # existing files. + name, length = name_and_length + if len(name) > 16: + # As long as the sequence is short, can steal its leading spaces + # to extend the name over the current 16 character limit. + # However, that deserves a warning as it is out of spec. + warnings.warn( + "GenBank LOCUS line identifier over 16 characters", + BiopythonParserWarning, + ) + consumer.locus(name) + consumer.size(length) + # consumer.residue_type(line[33:41].strip()) + + if line[33:51].strip() == "" and line[29:33] == " aa ": + # Amino acids -> protein (even if there is no residue type given) + consumer.residue_type("PROTEIN") + else: + consumer.residue_type(line[33:51].strip()) + + consumer.molecule_type(line[33:41].strip()) + consumer.topology(line[42:51].strip()) + consumer.data_file_division(line[52:55]) + if line[62:73].strip(): + consumer.date(line[62:73]) + elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [ + "", + "linear", + "circular", + ]: + # New... linear/circular/big blank test should avoid EnsEMBL style + # LOCUS line being treated like a proper column based LOCUS line. + # + # Positions Contents + # --------- -------- + # 00:06 LOCUS + # 06:12 spaces + # 12:?? Locus name + # ??:?? space + # ??:40 Length of sequence, right-justified + # 40:44 space, bp, space + # 44:47 Blank, ss-, ds-, ms- + # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA + # 54:55 space + # 55:63 Blank (implies linear), linear or circular + # 63:64 space + # 64:67 The division code (e.g. BCT, VRL, INV) + # 67:68 space + # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) + # + if len(line) < 79: + # JBEI genbank files seem to miss a division code and date + # See issue #1656 e.g. 
+ # LOCUS pEH010 5743 bp DNA circular + warnings.warn( + "Truncated LOCUS line found - is this correct?\n:%r" % line, + BiopythonParserWarning, + ) + padding_len = 79 - len(line) + padding = " " * padding_len + line += padding + + if line[40:44] not in [" bp ", " aa ", " rc "]: + raise ValueError( + "LOCUS line does not contain size units at " + "expected position:\n" + line + ) + if line[44:47] not in [" ", "ss-", "ds-", "ms-"]: + raise ValueError( + "LOCUS line does not have valid strand " + "type (Single stranded, ...):\n" + line + ) + + if not ( + line[47:54].strip() == "" + or "DNA" in line[47:54].strip().upper() + or "RNA" in line[47:54].strip().upper() + ): + raise ValueError( + "LOCUS line does not contain valid " + "sequence type (DNA, RNA, ...):\n" + line + ) + if line[54:55] != " ": + raise ValueError( + "LOCUS line does not contain space at position 55:\n" + line + ) + if line[55:63].strip() not in ["", "linear", "circular"]: + raise ValueError( + "LOCUS line does not contain valid " + "entry (linear, circular, ...):\n" + line + ) + if line[63:64] != " ": + raise ValueError( + "LOCUS line does not contain space at position 64:\n" + line + ) + if line[67:68] != " ": + raise ValueError( + "LOCUS line does not contain space at position 68:\n" + line + ) + if line[68:79].strip(): + if line[70:71] != "-": + raise ValueError( + "LOCUS line does not contain - at " + "position 71 in date:\n" + line + ) + if line[74:75] != "-": + raise ValueError( + "LOCUS line does not contain - at " + "position 75 in date:\n" + line + ) + + name_and_length_str = line[self.GENBANK_INDENT : 40] + while " " in name_and_length_str: + name_and_length_str = name_and_length_str.replace(" ", " ") + name_and_length = name_and_length_str.split(" ") + if len(name_and_length) > 2: + raise ValueError( + "Cannot parse the name and length in the LOCUS line:\n" + line + ) + if len(name_and_length) == 1: + raise ValueError("Name and length collide in the LOCUS line:\n" + line) + # Should be possible to split them based on position, if + # a clear definition of the stand exists THAT AGREES with + # existing files. + consumer.locus(name_and_length[0]) + consumer.size(name_and_length[1]) + + if line[44:54].strip() == "" and line[40:44] == " aa ": + # Amino acids -> protein (even if there is no residue type given) + consumer.residue_type(("PROTEIN " + line[54:63]).strip()) + else: + consumer.residue_type(line[44:63].strip()) + + consumer.molecule_type(line[44:54].strip()) + consumer.topology(line[55:63].strip()) + if line[64:76].strip(): + consumer.data_file_division(line[64:67]) + if line[68:79].strip(): + consumer.date(line[68:79]) + elif line[self.GENBANK_INDENT :].strip().count(" ") == 0: + # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 + # + # e.g. + # + # "LOCUS U00096" + # + # rather than: + # + # "LOCUS U00096 4639675 bp DNA circular BCT" + # + # Positions Contents + # --------- -------- + # 00:06 LOCUS + # 06:12 spaces + # 12:?? Locus name + if line[self.GENBANK_INDENT :].strip() != "": + consumer.locus(line[self.GENBANK_INDENT :].strip()) + else: + # Must just have just "LOCUS ", is this even legitimate? + # We should be able to continue parsing... we need real world testcases! 
+                warnings.warn(
+                    "Minimal LOCUS line found - is this correct?\n:%r" % line,
+                    BiopythonParserWarning,
+                )
+        elif (
+            len(line.split()) == 8
+            and line.split()[3] in ("aa", "bp")
+            and line.split()[5] in ("linear", "circular")
+        ):
+            # Cope with invalidly spaced GenBank LOCUS lines like
+            # LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001
+            # This will also cope with extra long accession numbers and
+            # sequence lengths
+            splitline = line.split()
+            consumer.locus(splitline[1])
+            # Provide a descriptive error message if the sequence is too long
+            # for python to handle
+
+            if int(splitline[2]) > sys.maxsize:
+                raise ValueError(
+                    "Tried to load a sequence with a length %s, "
+                    "your installation of python can only load "
+                    "sequences of length %s" % (splitline[2], sys.maxsize)
+                )
+            else:
+                consumer.size(splitline[2])
+
+            consumer.residue_type(splitline[4])
+            consumer.topology(splitline[5])
+            consumer.data_file_division(splitline[6])
+            consumer.date(splitline[7])
+            if len(line) < 80:
+                warnings.warn(
+                    "Attempting to parse malformed locus line:\n%r\n"
+                    "Found locus %r size %r residue_type %r\n"
+                    "Some fields may be wrong."
+                    % (line, splitline[1], splitline[2], splitline[4]),
+                    BiopythonParserWarning,
+                )
+        elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]:
+            # Cope with EnsEMBL genbank files which use space separation rather
+            # than the expected column based layout. e.g.
+            # LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011
+            # LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011
+            # LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011
+            # LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011
+            # Notice that the 'bp' can occur in the position expected by either
+            # the old or the new fixed column standards (parsed above).
+            splitline = line.split()
+            consumer.locus(splitline[1])
+            consumer.size(splitline[2])
+            consumer.residue_type(splitline[4])
+            consumer.data_file_division(splitline[5])
+            consumer.date(splitline[6])
+        elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
+            # Cope with EMBOSS seqret output where it seems the locus id can cause
+            # the other fields to overflow. We just IGNORE the other fields!
+            warnings.warn(
+                "Malformed LOCUS line found - is this correct?\n:%r" % line,
+                BiopythonParserWarning,
+            )
+            consumer.locus(line.split()[1])
+            consumer.size(line.split()[2])
+        elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]:
+            # Cope with pseudo-GenBank files like this:
+            # "LOCUS RNA5 complete 1718 bp"
+            # Treat everything between LOCUS and the size as the identifier.
+            warnings.warn(
+                "Malformed LOCUS line found - is this correct?\n:%r" % line,
+                BiopythonParserWarning,
+            )
+            consumer.locus(line[5:].rsplit(None, 2)[0].strip())
+            consumer.size(line.split()[-2])
+        else:
+            raise ValueError("Did not recognise the LOCUS line layout:\n" + line)
+
+    def _feed_header_lines(self, consumer, lines):
+        # The following dictionary maps GenBank lines to the associated
+        # consumer methods - special cases like LOCUS, where one GenBank
+        # line triggers several consumer calls, have to be
+        # handled individually.
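+        # As an illustration (hypothetical record text), a header line such as
+        #     DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds.
+        # is split at column GENBANK_INDENT into the key "DEFINITION" and the
+        # data "Saccharomyces cerevisiae TCP1-beta gene, partial cds.", then
+        # dispatched through consumer_dict below as consumer.definition(data).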
+ consumer_dict = { + "DEFINITION": "definition", + "ACCESSION": "accession", + "NID": "nid", + "PID": "pid", + "DBSOURCE": "db_source", + "KEYWORDS": "keywords", + "SEGMENT": "segment", + "SOURCE": "source", + "AUTHORS": "authors", + "CONSRTM": "consrtm", + "PROJECT": "project", + "TITLE": "title", + "JOURNAL": "journal", + "MEDLINE": "medline_id", + "PUBMED": "pubmed_id", + "REMARK": "remark", + } + # We have to handle the following specially: + # ORIGIN (locus, size, residue_type, data_file_division and date) + # COMMENT (comment) + # VERSION (version and gi) + # DBLINK (database links like projects, newlines important) + # REFERENCE (eference_num and reference_bases) + # ORGANISM (organism and taxonomy) + lines = [_f for _f in lines if _f] + lines.append("") # helps avoid getting StopIteration all the time + line_iter = iter(lines) + try: + line = next(line_iter) + while True: + if not line: + break + line_type = line[: self.GENBANK_INDENT].strip() + data = line[self.GENBANK_INDENT :].strip() + + if line_type == "VERSION": + # Need to call consumer.version(), and maybe also consumer.gi() as well. + # e.g. + # VERSION AC007323.5 GI:6587720 + while " " in data: + data = data.replace(" ", " ") + if " GI:" not in data: + consumer.version(data) + else: + if self.debug: + print( + "Version [" + + data.split(" GI:")[0] + + "], gi [" + + data.split(" GI:")[1] + + "]" + ) + consumer.version(data.split(" GI:")[0]) + consumer.gi(data.split(" GI:")[1]) + # Read in the next line! + line = next(line_iter) + elif line_type == "DBLINK": + # Need to call consumer.dblink() for each line, e.g. + # DBLINK Project: 57779 + # BioProject: PRJNA57779 + consumer.dblink(data.strip()) + # Read in the next line, and see if its more of the DBLINK section: + while True: + line = next(line_iter) + if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER: + # Add this continuation to the data string + consumer.dblink(line[self.GENBANK_INDENT :].strip()) + else: + # End of the DBLINK, leave this text in the variable "line" + break + elif line_type == "REFERENCE": + if self.debug > 1: + print("Found reference [" + data + "]") + # Need to call consumer.reference_num() and consumer.reference_bases() + # e.g. + # REFERENCE 1 (bases 1 to 86436) + # + # Note that this can be multiline, see Bug 1968, e.g. + # + # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to + # 28259) + # + # For such cases we will call the consumer once only. + data = data.strip() + + # Read in the next line, and see if its more of the reference: + while True: + line = next(line_iter) + if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER: + # Add this continuation to the data string + data += " " + line[self.GENBANK_INDENT :] + if self.debug > 1: + print("Extended reference text [" + data + "]") + else: + # End of the reference, leave this text in the variable "line" + break + + # We now have all the reference line(s) stored in a string, data, + # which we pass to the consumer + while " " in data: + data = data.replace(" ", " ") + if " " not in data: + if self.debug > 2: + print('Reference number "' + data + '"') + consumer.reference_num(data) + else: + if self.debug > 2: + print( + 'Reference number "' + + data[: data.find(" ")] + + '", "' + + data[data.find(" ") + 1 :] + + '"' + ) + consumer.reference_num(data[: data.find(" ")]) + consumer.reference_bases(data[data.find(" ") + 1 :]) + elif line_type == "ORGANISM": + # Typically the first line is the organism, and subsequent lines + # are the taxonomy lineage. 
However, given longer and longer + # species names (as more and more strains and sub strains get + # sequenced) the oragnism name can now get wrapped onto multiple + # lines. The NCBI say we have to recognise the lineage line by + # the presence of semi-colon delimited entries. In the long term, + # they are considering adding a new keyword (e.g. LINEAGE). + # See Bug 2591 for details. + organism_data = data + lineage_data = "" + while True: + line = next(line_iter) + if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER: + if lineage_data or ";" in line: + lineage_data += " " + line[self.GENBANK_INDENT :] + elif line[self.GENBANK_INDENT :].strip() == ".": + # No lineage data, just . place holder + pass + else: + organism_data += ( + " " + line[self.GENBANK_INDENT :].strip() + ) + else: + # End of organism and taxonomy + break + consumer.organism(organism_data) + if lineage_data.strip() == "" and self.debug > 1: + print("Taxonomy line(s) missing or blank") + consumer.taxonomy(lineage_data.strip()) + del organism_data, lineage_data + elif line_type == "COMMENT": + # A COMMENT can either be plain text or tabular (Structured Comment), + # or contain both. Multi-line comments are common. The code calls + # consumer.comment() once with a list where each entry + # is a line. If there's a structured comment consumer.structured_comment() + # is called with a dict of dicts where the secondary key/value pairs are + # the same as those in the structured comment table. The primary key is + # the title or header of the table (e.g. Assembly-Data, FluData). See + # http://www.ncbi.nlm.nih.gov/genbank/structuredcomment + # for more information on Structured Comments. + data = line[self.GENBANK_INDENT :] + if self.debug > 1: + print("Found comment") + comment_list = [] + structured_comment_dict = OrderedDict() + regex = fr"([^#]+){self.STRUCTURED_COMMENT_START}$" + structured_comment_key = re.search(regex, data) + if structured_comment_key is not None: + structured_comment_key = structured_comment_key.group(1) + if self.debug > 1: + print("Found Structured Comment") + else: + comment_list.append(data) + + while True: + line = next(line_iter) + data = line[self.GENBANK_INDENT :] + if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER: + if self.STRUCTURED_COMMENT_START in data: + regex = r"([^#]+){}$".format( + self.STRUCTURED_COMMENT_START + ) + structured_comment_key = re.search(regex, data) + if structured_comment_key is not None: + structured_comment_key = structured_comment_key.group( + 1 + ) + else: + comment_list.append(data) + elif ( + structured_comment_key is not None + and self.STRUCTURED_COMMENT_DELIM in data + ): + match = re.search( + r"(.+?)\s*{}\s*(.+)".format( + self.STRUCTURED_COMMENT_DELIM + ), + data, + ) + structured_comment_dict.setdefault( + structured_comment_key, OrderedDict() + ) + structured_comment_dict[structured_comment_key][ + match.group(1) + ] = match.group(2) + if self.debug > 2: + print( + "Structured Comment continuation [" + data + "]" + ) + elif ( + structured_comment_key is not None + and self.STRUCTURED_COMMENT_END not in data + ): + # Don't die on a malformed comment, just warn and carry on + if ( + structured_comment_key + not in structured_comment_dict + ): + warnings.warn( + "Structured comment not parsed for %s. Is it malformed?" 
+                                        % consumer.data.name,
+                                        BiopythonParserWarning,
+                                    )
+                                    continue
+
+                                # The current structured comment has a multiline value
+                                previous_value_line = structured_comment_dict[
+                                    structured_comment_key
+                                ][match.group(1)]
+                                structured_comment_dict[structured_comment_key][
+                                    match.group(1)
+                                ] = (previous_value_line + " " + line.strip())
+                            elif self.STRUCTURED_COMMENT_END in data:
+                                # End of structured comment
+                                structured_comment_key = None
+                            else:
+                                comment_list.append(data)
+                                if self.debug > 2:
+                                    print("Comment continuation [" + data + "]")
+                        else:
+                            # End of the comment
+                            break
+                    if comment_list:
+                        consumer.comment(comment_list)
+                    if structured_comment_dict:
+                        consumer.structured_comment(structured_comment_dict)
+                    del comment_list, structured_comment_key, structured_comment_dict
+                elif line_type in consumer_dict:
+                    # It's a semi-automatic entry!
+                    # Now, this may be a multi line entry...
+                    while True:
+                        line = next(line_iter)
+                        if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER:
+                            data += " " + line[self.GENBANK_INDENT :]
+                        else:
+                            # We now have all the data for this entry:
+
+                            # The DEFINITION field must end with a period
+                            # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5]
+                            # and discussion https://github.com/biopython/biopython/pull/616
+                            # We consider this period to belong to the syntax,
+                            # not to the data, so remove it if it exists
+                            if line_type == "DEFINITION" and data.endswith("."):
+                                data = data[:-1]
+                            getattr(consumer, consumer_dict[line_type])(data)
+                            # End of continuation - return to top of loop!
+                            break
+                else:
+                    if self.debug:
+                        print("Ignoring GenBank header line:\n%r" % line)
+                    # Read in next line
+                    line = next(line_iter)
+        except StopIteration:
+            raise ValueError("Problem in header") from None
+
+    def _feed_misc_lines(self, consumer, lines):
+        # Deals with a few misc lines between the features and the sequence
+        lines.append("")
+        line_iter = iter(lines)
+        try:
+            for line in line_iter:
+                if line.startswith("BASE COUNT"):
+                    line = line[10:].strip()
+                    if line:
+                        if self.debug:
+                            print("base_count = " + line)
+                        consumer.base_count(line)
+                if line.startswith("ORIGIN"):
+                    line = line[6:].strip()
+                    if line:
+                        if self.debug:
+                            print("origin_name = " + line)
+                        consumer.origin_name(line)
+                if line.startswith("TLS "):
+                    line = line[3:].strip()
+                    consumer.tls(line)
+                if line.startswith("TSA "):
+                    line = line[3:].strip()
+                    consumer.tsa(line)
+                if line.startswith("WGS "):
+                    line = line[3:].strip()
+                    consumer.wgs(line)
+                if line.startswith("WGS_SCAFLD"):
+                    line = line[10:].strip()
+                    consumer.add_wgs_scafld(line)
+                if line.startswith("CONTIG"):
+                    line = line[6:].strip()
+                    contig_location = line
+                    while True:
+                        line = next(line_iter)
+                        if not line:
+                            break
+                        elif line[: self.GENBANK_INDENT] == self.GENBANK_SPACER:
+                            # Don't need to preserve the whitespace here.
+                            contig_location += line[self.GENBANK_INDENT :].rstrip()
+                        elif line.startswith("ORIGIN"):
+                            # Strange, seen this in GenPept files via Entrez gbwithparts
+                            line = line[6:].strip()
+                            if line:
+                                consumer.origin_name(line)
+                            break
+                        else:
+                            raise ValueError(
+                                "Expected CONTIG continuation line, got:\n" + line
+                            )
+                    consumer.contig_location(contig_location)
+            return
+        except StopIteration:
+            raise ValueError("Problem in misc lines before sequence") from None
diff --git a/code/lib/Bio/GenBank/__init__.py b/code/lib/Bio/GenBank/__init__.py
new file mode 100644
index 0000000..1875116
--- /dev/null
+++ b/code/lib/Bio/GenBank/__init__.py
@@ -0,0 +1,1746 @@
+# Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved.
+# Copyright 2006-2017 by Peter Cock. All rights reserved. +# +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to work with GenBank formatted files. + +Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with +the "genbank" or "embl" format names to parse GenBank or EMBL files into +SeqRecord and SeqFeature objects (see the Biopython tutorial for details). + +Using Bio.GenBank directly to parse GenBank files is only useful if you want +to obtain GenBank-specific Record objects, which is a much closer +representation to the raw file contents than the SeqRecord alternative from +the FeatureParser (used in Bio.SeqIO). + +To use the Bio.GenBank parser, there are two helper functions: + + - read Parse a handle containing a single GenBank record + as Bio.GenBank specific Record objects. + - parse Iterate over a handle containing multiple GenBank + records as Bio.GenBank specific Record objects. + +The following internal classes are not intended for direct use and may +be deprecated in a future release. + +Classes: + - Iterator Iterate through a file of GenBank entries + - ErrorFeatureParser Catch errors caused during parsing. + - FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects. + - RecordParser Parse GenBank data into a Record object. + +Exceptions: + - ParserFailureError Exception indicating a failure in the parser (ie. + scanner or consumer) + - LocationParserError Exception indicating a problem with the spark based + location parser. + +""" + +import re +import warnings + +from Bio import BiopythonParserWarning +from Bio.Seq import Seq +from Bio import SeqFeature + +# other Bio.GenBank stuff +from .utils import FeatureValueCleaner +from .Scanner import GenBankScanner + + +# Constants used to parse GenBank header lines +GENBANK_INDENT = 12 +GENBANK_SPACER = " " * GENBANK_INDENT + +# Constants for parsing GenBank feature lines +FEATURE_KEY_INDENT = 5 +FEATURE_QUALIFIER_INDENT = 21 +FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT +FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT + +# Regular expressions for location parsing +_solo_location = r"[<>]?\d+" +_pair_location = r"[<>]?\d+\.\.[<>]?\d+" +_between_location = r"\d+\^\d+" + +_within_position = r"\(\d+\.\d+\)" +_re_within_position = re.compile(_within_position) +_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % ( + _within_position, + _within_position, +) +assert _re_within_position.match("(3.9)") +assert re.compile(_within_location).match("(3.9)..10") +assert re.compile(_within_location).match("26..(30.33)") +assert re.compile(_within_location).match("(13.19)..(20.28)") + +_oneof_position = r"one\-of\(\d+(,\d+)+\)" +_re_oneof_position = re.compile(_oneof_position) +_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position) +assert _re_oneof_position.match("one-of(6,9)") +assert re.compile(_oneof_location).match("one-of(6,9)..101") +assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") +assert re.compile(_oneof_location).match("6..one-of(101,104)") + +assert not _re_oneof_position.match("one-of(3)") +assert _re_oneof_position.match("one-of(3,6)") +assert _re_oneof_position.match("one-of(3,6,9)") + + +_simple_location = r"\d+\.\.\d+" +_re_simple_location = re.compile(r"^%s$" % _simple_location) +_re_simple_compound = re.compile( + r"^(join|order|bond)\(%s(,%s)*\)$" % (_simple_location, _simple_location) +) 
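+# For example, _re_simple_location matches a plain span such as "104..160",
+# while _re_simple_compound matches "join(104..160,320..390,504..579)";
+# fuzzy spans like "<104..>160" are left to the complex patterns below
+# (the assertions further down exercise exactly these cases).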
+_complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" % ( + _pair_location, + _solo_location, + _between_location, + _within_location, + _oneof_location, +) +_re_complex_location = re.compile(r"^%s$" % _complex_location) +_possibly_complemented_complex_location = r"(%s|complement\(%s\))" % ( + _complex_location, + _complex_location, +) +_re_complex_compound = re.compile( + r"^(join|order|bond)\(%s(,%s)*\)$" + % (_possibly_complemented_complex_location, _possibly_complemented_complex_location) +) + + +assert _re_simple_location.match("104..160") +assert not _re_simple_location.match("68451760..68452073^68452074") +assert not _re_simple_location.match("<104..>160") +assert not _re_simple_location.match("104") +assert not _re_simple_location.match("<1") +assert not _re_simple_location.match(">99999") +assert not _re_simple_location.match("join(104..160,320..390,504..579)") +assert not _re_simple_compound.match("bond(12,63)") +assert _re_simple_compound.match("join(104..160,320..390,504..579)") +assert _re_simple_compound.match("order(1..69,1308..1465)") +assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") +assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") +assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") +assert not _re_simple_compound.match( + "join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)" +) +assert not _re_simple_compound.match("test(1..69,1308..1465)") +assert not _re_simple_compound.match("complement(1..69)") +assert not _re_simple_compound.match("(1..69)") +assert _re_complex_location.match("(3.9)..10") +assert _re_complex_location.match("26..(30.33)") +assert _re_complex_location.match("(13.19)..(20.28)") +assert _re_complex_location.match("41^42") # between +assert _re_complex_location.match("AL121804:41^42") +assert _re_complex_location.match("AL121804:41..610") +assert _re_complex_location.match("AL121804.2:41..610") +assert _re_complex_location.match( + "AL358792.24.1.166931:3274..3461" +) # lots of dots in external reference +assert _re_complex_location.match("one-of(3,6)..101") +assert _re_complex_compound.match( + "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)" +) +assert not _re_simple_compound.match( + "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)" +) +assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") +assert _re_complex_compound.match( + "join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))" +) + +# Trans-spliced example from NC_016406, note underscore in reference name: +assert _re_complex_location.match("NC_016402.1:6618..6676") +assert _re_complex_location.match("181647..181905") +assert _re_complex_compound.match( + "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)" +) +assert not _re_complex_location.match( + "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)" +) +assert not _re_simple_compound.match( + "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)" +) +assert not _re_complex_location.match( + "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)" +) +assert not _re_simple_location.match( + "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)" +) + +_solo_bond = re.compile(r"bond\(%s\)" % _solo_location) 
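+# A hypothetical protein feature location such as "join(bond(284),bond(305))"
+# is matched piecewise by _solo_bond; the feature consumer later strips each
+# "bond(...)" wrapper (with a BiopythonParserWarning), leaving "join(284,305)".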
+assert _solo_bond.match("bond(196)") +assert _solo_bond.search("bond(196)") +assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") + + +def _pos(pos_str, offset=0): + """Build a Position object (PRIVATE). + + For an end position, leave offset as zero (default): + + >>> _pos("5") + ExactPosition(5) + + For a start position, set offset to minus one (for Python counting): + + >>> _pos("5", -1) + ExactPosition(4) + + This also covers fuzzy positions: + + >>> p = _pos("<5") + >>> p + BeforePosition(5) + >>> print(p) + <5 + >>> int(p) + 5 + + >>> _pos(">5") + AfterPosition(5) + + By default assumes an end position, so note the integer behaviour: + + >>> p = _pos("one-of(5,8,11)") + >>> p + OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)]) + >>> print(p) + one-of(5,8,11) + >>> int(p) + 11 + + >>> _pos("(8.10)") + WithinPosition(10, left=8, right=10) + + Fuzzy start positions: + + >>> p = _pos("<5", -1) + >>> p + BeforePosition(4) + >>> print(p) + <4 + >>> int(p) + 4 + + Notice how the integer behaviour changes too! + + >>> p = _pos("one-of(5,8,11)", -1) + >>> p + OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)]) + >>> print(p) + one-of(4,7,10) + >>> int(p) + 4 + + """ + if pos_str.startswith("<"): + return SeqFeature.BeforePosition(int(pos_str[1:]) + offset) + elif pos_str.startswith(">"): + return SeqFeature.AfterPosition(int(pos_str[1:]) + offset) + elif _re_within_position.match(pos_str): + s, e = pos_str[1:-1].split(".") + s = int(s) + offset + e = int(e) + offset + if offset == -1: + default = s + else: + default = e + return SeqFeature.WithinPosition(default, left=s, right=e) + elif _re_oneof_position.match(pos_str): + assert pos_str.startswith("one-of(") + assert pos_str[-1] == ")" + parts = [ + SeqFeature.ExactPosition(int(pos) + offset) + for pos in pos_str[7:-1].split(",") + ] + if offset == -1: + default = min(int(pos) for pos in parts) + else: + default = max(int(pos) for pos in parts) + return SeqFeature.OneOfPosition(default, choices=parts) + else: + return SeqFeature.ExactPosition(int(pos_str) + offset) + + +def _loc(loc_str, expected_seq_length, strand, seq_type=None): + """Make FeatureLocation from non-compound non-complement location (PRIVATE). + + This is also invoked to 'automatically' fix ambiguous formatting of features + that span the origin of a circular sequence. + + Simple examples, + + >>> _loc("123..456", 1000, +1) + FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1) + >>> _loc("<123..>456", 1000, strand = -1) + FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1) + + A more complex location using within positions, + + >>> _loc("(9.10)..(20.25)", 1000, 1) + FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1) + + Notice how that will act as though it has overall start 8 and end 25. + + Zero length between feature, + + >>> _loc("123^124", 1000, 0) + FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0) + + The expected sequence length is needed for a special case, a between + position at the start/end of a circular genome: + + >>> _loc("1000^1", 1000, 1) + FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1) + + Apart from this special case, between positions P^Q must have P+1==Q, + + >>> _loc("123^456", 1000, 1) + Traceback (most recent call last): + ... 
+ ValueError: Invalid between location '123^456' + + You can optionally provide a reference name: + + >>> _loc("AL391218.9:105173..108462", 2000000, 1) + FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9') + + >>> _loc("<2644..159", 2868, 1, "circular") + CompoundLocation([FeatureLocation(BeforePosition(2643), ExactPosition(2868), strand=1), FeatureLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join') + """ + if ":" in loc_str: + ref, loc_str = loc_str.split(":") + else: + ref = None + try: + s, e = loc_str.split("..") + except ValueError: + assert ".." not in loc_str + if "^" in loc_str: + # A between location like "67^68" (one based counting) is a + # special case (note it has zero length). In python slice + # notation this is 67:67, a zero length slice. See Bug 2622 + # Further more, on a circular genome of length N you can have + # a location N^1 meaning the junction at the origin. See Bug 3098. + # NOTE - We can imagine between locations like "2^4", but this + # is just "3". Similarly, "2^5" is just "3..4" + s, e = loc_str.split("^") + if int(s) + 1 == int(e): + pos = _pos(s) + elif int(s) == expected_seq_length and e == "1": + pos = _pos(s) + else: + raise ValueError("Invalid between location %r" % loc_str) from None + return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref) + else: + # e.g. "123" + s = loc_str + e = loc_str + + # Attempt to fix features that span the origin + s_pos = _pos(s, -1) + e_pos = _pos(e) + if int(s_pos) > int(e_pos): + if seq_type is None or "circular" not in seq_type.lower(): + warnings.warn( + "It appears that %r is a feature that spans " + "the origin, but the sequence topology is " + "undefined. Skipping feature." % loc_str, + BiopythonParserWarning, + ) + return None + warnings.warn( + "Attempting to fix invalid location %r as " + "it looks like incorrect origin wrapping. " + "Please fix input file, this could have " + "unintended behavior." % loc_str, + BiopythonParserWarning, + ) + + f1 = SeqFeature.FeatureLocation(s_pos, expected_seq_length, strand) + f2 = SeqFeature.FeatureLocation(0, int(e_pos), strand) + + if strand == -1: + # For complementary features spanning the origin + return f2 + f1 + else: + return f1 + f2 + + return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref) + + +def _split_compound_loc(compound_loc): + """Split a tricky compound location string (PRIVATE). 
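+
+    Only top-level commas separate the parts; a comma inside a nested
+    one-of(...) group stays with its own part, as the examples below show.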
+ + >>> list(_split_compound_loc("123..145")) + ['123..145'] + >>> list(_split_compound_loc("123..145,200..209")) + ['123..145', '200..209'] + >>> list(_split_compound_loc("one-of(200,203)..300")) + ['one-of(200,203)..300'] + >>> list(_split_compound_loc("complement(123..145),200..209")) + ['complement(123..145)', '200..209'] + >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) + ['123..145', 'one-of(200,203)..209'] + >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) + ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] + >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) + ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] + >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) + ['123..145', '200..one-of(209,211)', '300'] + >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) + ['123..145', '200..one-of(209,211)'] + >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) + ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] + """ + if "one-of(" in compound_loc: + # Hard case + while "," in compound_loc: + assert compound_loc[0] != "," + assert compound_loc[0:2] != ".." + i = compound_loc.find(",") + part = compound_loc[:i] + compound_loc = compound_loc[i:] # includes the comma + while part.count("(") > part.count(")"): + assert "one-of(" in part, (part, compound_loc) + i = compound_loc.find(")") + part += compound_loc[: i + 1] + compound_loc = compound_loc[i + 1 :] + if compound_loc.startswith(".."): + i = compound_loc.find(",") + if i == -1: + part += compound_loc + compound_loc = "" + else: + part += compound_loc[:i] + compound_loc = compound_loc[i:] # includes the comma + while part.count("(") > part.count(")"): + assert part.count("one-of(") == 2 + i = compound_loc.find(")") + part += compound_loc[: i + 1] + compound_loc = compound_loc[i + 1 :] + if compound_loc.startswith(","): + compound_loc = compound_loc[1:] + assert part + yield part + if compound_loc: + yield compound_loc + else: + # Easy case + yield from compound_loc.split(",") + + +class Iterator: + """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). + + This class is likely to be deprecated in a future release of Biopython. + Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) + for SeqRecord and GenBank specific Record objects respectively instead. + """ + + def __init__(self, handle, parser=None): + """Initialize the iterator. + + Arguments: + - handle - A handle with GenBank entries to iterate through. + - parser - An optional parser to pass the entries through before + returning them. If None, then the raw entry will be returned. + + """ + self.handle = handle + self._parser = parser + + def __next__(self): + """Return the next GenBank record from the handle. + + Will return None if we ran out of records. + """ + if self._parser is None: + lines = [] + while True: + line = self.handle.readline() + if not line: + return None # Premature end of file? 
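+                # Accumulate the raw record text: a GenBank entry ends with a
+                # line containing just "//", so keep reading until that is seen.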
+                lines.append(line)
+                if line.rstrip() == "//":
+                    break
+            return "".join(lines)
+        try:
+            return self._parser.parse(self.handle)
+        except StopIteration:
+            return None
+
+    def __iter__(self):
+        """Iterate over the records."""
+        return iter(self.__next__, None)
+
+
+class ParserFailureError(Exception):
+    """Failure caused by some kind of problem in the parser."""
+
+    pass
+
+
+class LocationParserError(Exception):
+    """Could not properly parse out a location from a GenBank file."""
+
+    pass
+
+
+_cleaner = FeatureValueCleaner()
+
+
+class FeatureParser:
+    """Parse GenBank files into Seq + Feature objects (OBSOLETE).
+
+    Direct use of this class is discouraged, and may be deprecated in
+    a future release of Biopython.
+
+    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
+    """
+
+    def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None):
+        """Initialize a GenBank parser and Feature consumer.
+
+        Arguments:
+         - debug_level - An optional argument that specifies the amount of
+           debugging information the parser should spit out. By default we have
+           no debugging info (the fastest way to do things), but if you want
+           you can set this as high as two and see exactly where a parse fails.
+         - use_fuzziness - Specify whether or not to use fuzzy representations.
+           The default is 1 (use fuzziness).
+         - feature_cleaner - A class which will be used to clean out the
+           values of features. This class must implement the function
+           clean_value. GenBank.utils has a "standard" cleaner class, which
+           is used by default.
+
+        """
+        self._scanner = GenBankScanner(debug_level)
+        self.use_fuzziness = use_fuzziness
+        if feature_cleaner:
+            self._cleaner = feature_cleaner
+        else:
+            self._cleaner = _cleaner  # default
+
+    def parse(self, handle):
+        """Parse the specified handle."""
+        _consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
+        self._scanner.feed(handle, _consumer)
+        return _consumer.data
+
+
+class RecordParser:
+    """Parse GenBank files into Record objects (OBSOLETE).
+
+    Direct use of this class is discouraged, and may be deprecated in
+    a future release of Biopython.
+
+    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
+    instead.
+    """
+
+    def __init__(self, debug_level=0):
+        """Initialize the parser.
+
+        Arguments:
+         - debug_level - An optional argument that specifies the amount of
+           debugging information the parser should spit out. By default we have
+           no debugging info (the fastest way to do things), but if you want
+           you can set this as high as two and see exactly where a parse fails.
+
+        """
+        self._scanner = GenBankScanner(debug_level)
+
+    def parse(self, handle):
+        """Parse the specified handle into a GenBank record."""
+        _consumer = _RecordConsumer()
+
+        self._scanner.feed(handle, _consumer)
+        return _consumer.data
+
+
+class _BaseGenBankConsumer:
+    """Abstract GenBank consumer providing useful general functions (PRIVATE).
+
+    This just helps to eliminate some duplication in things that most
+    GenBank consumers want to do.
+    """
+
+    # Special keys in GenBank records that we should remove spaces from.
+    # For instance, /translation keys have values which are proteins and
+    # should have spaces and newlines removed from them. This class
+    # attribute gives us more control over specific formatting problems.
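+    # As a hypothetical example, a /translation value that arrives from the
+    # parser as "MKV LID WGR" would be stored as "MKVLIDWGR" once the spaces
+    # are stripped for the keys listed below.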
+ remove_space_keys = ["translation"] + + def __init__(self): + pass + + @staticmethod + def _split_keywords(keyword_string): + """Split a string of keywords into a nice clean list (PRIVATE).""" + # process the keywords into a python list + if keyword_string == "" or keyword_string == ".": + keywords = "" + elif keyword_string[-1] == ".": + keywords = keyword_string[:-1] + else: + keywords = keyword_string + keyword_list = keywords.split(";") + return [x.strip() for x in keyword_list] + + @staticmethod + def _split_accessions(accession_string): + """Split a string of accession numbers into a list (PRIVATE).""" + # first replace all line feeds with spaces + # Also, EMBL style accessions are split with ';' + accession = accession_string.replace("\n", " ").replace(";", " ") + + return [x.strip() for x in accession.split() if x.strip()] + + @staticmethod + def _split_taxonomy(taxonomy_string): + """Split a string with taxonomy info into a list (PRIVATE).""" + if not taxonomy_string or taxonomy_string == ".": + # Missing data, no taxonomy + return [] + + if taxonomy_string[-1] == ".": + tax_info = taxonomy_string[:-1] + else: + tax_info = taxonomy_string + tax_list = tax_info.split(";") + new_tax_list = [] + for tax_item in tax_list: + new_items = tax_item.split("\n") + new_tax_list.extend(new_items) + while "" in new_tax_list: + new_tax_list.remove("") + return [x.strip() for x in new_tax_list] + + @staticmethod + def _clean_location(location_string): + """Clean whitespace out of a location string (PRIVATE). + + The location parser isn't a fan of whitespace, so we clean it out + before feeding it into the parser. + """ + # Originally this imported string.whitespace and did a replace + # via a loop. It's simpler to just split on whitespace and rejoin + # the string - and this avoids importing string too. See Bug 2684. + return "".join(location_string.split()) + + @staticmethod + def _remove_newlines(text): + """Remove any newlines in the passed text, returning the new string (PRIVATE).""" + # get rid of newlines in the qualifier value + newlines = ["\n", "\r"] + for ws in newlines: + text = text.replace(ws, "") + + return text + + @staticmethod + def _normalize_spaces(text): + """Replace multiple spaces in the passed text with single spaces (PRIVATE).""" + # get rid of excessive spaces + return " ".join(x for x in text.split(" ") if x) + + @staticmethod + def _remove_spaces(text): + """Remove all spaces from the passed text (PRIVATE).""" + return text.replace(" ", "") + + @staticmethod + def _convert_to_python_numbers(start, end): + """Convert a start and end range to python notation (PRIVATE). + + In GenBank, starts and ends are defined in "biological" coordinates, + where 1 is the first base and [i, j] means to include both i and j. + + In python, 0 is the first base and [i, j] means to include i, but + not j. + + So, to convert "biological" to python coordinates, we need to + subtract 1 from the start, and leave the end and things should + be converted happily. + """ + new_start = start - 1 + new_end = end + + return new_start, new_end + + +class _FeatureConsumer(_BaseGenBankConsumer): + """Create a SeqRecord object with Features to return (PRIVATE). + + Attributes: + - use_fuzziness - specify whether or not to parse with fuzziness in + feature locations. + - feature_cleaner - a class that will be used to provide specialized + cleaning-up of feature values. 
+ + """ + + def __init__(self, use_fuzziness, feature_cleaner=None): + from Bio.SeqRecord import SeqRecord + + _BaseGenBankConsumer.__init__(self) + self.data = SeqRecord(None, id=None) + self.data.id = None + self.data.description = "" + + self._use_fuzziness = use_fuzziness + self._feature_cleaner = feature_cleaner + + self._seq_type = "" + self._seq_data = [] + self._cur_reference = None + self._cur_feature = None + self._expected_size = None + + def locus(self, locus_name): + """Set the locus name is set as the name of the Sequence.""" + self.data.name = locus_name + + def size(self, content): + """Record the sequence length.""" + self._expected_size = int(content) + + def residue_type(self, type): + """Record the sequence type (SEMI-OBSOLETE). + + This reflects the fact that the topology (linear/circular) and + molecule type (e.g. DNA vs RNA) were a single field in early + files. Current GenBank/EMBL files have two fields. + """ + self._seq_type = type.strip() + + def topology(self, topology): + """Validate and record sequence topology. + + The topology argument should be "linear" or "circular" (string). + """ + if topology: + if topology not in ["linear", "circular"]: + raise ParserFailureError( + "Unexpected topology %r should be linear or circular" % topology + ) + self.data.annotations["topology"] = topology + + def molecule_type(self, mol_type): + """Validate and record the molecule type (for round-trip etc).""" + if mol_type: + if "circular" in mol_type or "linear" in mol_type: + raise ParserFailureError( + "Molecule type %r should not include topology" % mol_type + ) + + # Writing out records will fail if we have a lower case DNA + # or RNA string in here, so upper case it. + # This is a bit ugly, but we don't want to upper case e.g. + # the m in mRNA, but thanks to the strip we lost the spaces + # so we need to index from the back + if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): + warnings.warn( + "Non-upper case molecule type in LOCUS line: %s" % mol_type, + BiopythonParserWarning, + ) + + self.data.annotations["molecule_type"] = mol_type + + def data_file_division(self, division): + self.data.annotations["data_file_division"] = division + + def date(self, submit_date): + self.data.annotations["date"] = submit_date + + def definition(self, definition): + """Set the definition as the description of the sequence.""" + if self.data.description: + # Append to any existing description + # e.g. EMBL files with two DE lines. + self.data.description += " " + definition + else: + self.data.description = definition + + def accession(self, acc_num): + """Set the accession number as the id of the sequence. + + If we have multiple accession numbers, the first one passed is + used. + """ + new_acc_nums = self._split_accessions(acc_num) + + # Also record them ALL in the annotations + try: + # On the off chance there was more than one accession line: + for acc in new_acc_nums: + # Prevent repeat entries + if acc not in self.data.annotations["accessions"]: + self.data.annotations["accessions"].append(acc) + except KeyError: + self.data.annotations["accessions"] = new_acc_nums + + # if we haven't set the id information yet, add the first acc num + if not self.data.id: + if len(new_acc_nums) > 0: + # self.data.id = new_acc_nums[0] + # Use the FIRST accession as the ID, not the first on this line! 
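+                # e.g. a hypothetical record with ACCESSION "U49845 U49846"
+                # gets id "U49845"; later ACCESSION lines never overwrite an
+                # id that has already been set.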
+ self.data.id = self.data.annotations["accessions"][0] + + def tls(self, content): + self.data.annotations["tls"] = content.split("-") + + def tsa(self, content): + self.data.annotations["tsa"] = content.split("-") + + def wgs(self, content): + self.data.annotations["wgs"] = content.split("-") + + def add_wgs_scafld(self, content): + self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-")) + + def nid(self, content): + self.data.annotations["nid"] = content + + def pid(self, content): + self.data.annotations["pid"] = content + + def version(self, version_id): + # Want to use the versioned accession as the record.id + # This comes from the VERSION line in GenBank files, or the + # obsolete SV line in EMBL. For the new EMBL files we need + # both the version suffix from the ID line and the accession + # from the AC line. + if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): + self.accession(version_id.split(".")[0]) + self.version_suffix(version_id.split(".")[1]) + elif version_id: + # For backwards compatibility... + self.data.id = version_id + + def project(self, content): + """Handle the information from the PROJECT line as a list of projects. + + e.g.:: + + PROJECT GenomeProject:28471 + + or:: + + PROJECT GenomeProject:13543 GenomeProject:99999 + + This is stored as dbxrefs in the SeqRecord to be consistent with the + projected switch of this line to DBLINK in future GenBank versions. + Note the NCBI plan to replace "GenomeProject:28471" with the shorter + "Project:28471" as part of this transition. + """ + content = content.replace("GenomeProject:", "Project:") + self.data.dbxrefs.extend(p for p in content.split() if p) + + def dblink(self, content): + """Store DBLINK cross references as dbxrefs in our record object. + + This line type is expected to replace the PROJECT line in 2009. e.g. + + During transition:: + + PROJECT GenomeProject:28471 + DBLINK Project:28471 + Trace Assembly Archive:123456 + + Once the project line is dropped:: + + DBLINK Project:28471 + Trace Assembly Archive:123456 + + Note GenomeProject -> Project. + + We'll have to see some real examples to be sure, but based on the + above example we can expect one reference per line. + + Note that at some point the NCBI have included an extra space, e.g.:: + + DBLINK Project: 28471 + + """ + # During the transition period with both PROJECT and DBLINK lines, + # we don't want to add the same cross reference twice. + while ": " in content: + content = content.replace(": ", ":") + if content.strip() not in self.data.dbxrefs: + self.data.dbxrefs.append(content.strip()) + + def version_suffix(self, version): + """Set the version to overwrite the id. + + Since the version provides the same information as the accession + number, plus some extra info, we set this as the id if we have + a version. + """ + # e.g. GenBank line: + # VERSION U49845.1 GI:1293613 + # or the obsolete EMBL line: + # SV U49845.1 + # Scanner calls consumer.version("U49845.1") + # which then calls consumer.version_suffix(1) + # + # e.g. EMBL new line: + # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 
+ # Scanner calls consumer.version_suffix(1) + assert version.isdigit() + self.data.annotations["sequence_version"] = int(version) + + def db_source(self, content): + self.data.annotations["db_source"] = content.rstrip() + + def gi(self, content): + self.data.annotations["gi"] = content + + def keywords(self, content): + if "keywords" in self.data.annotations: + # Multi-line keywords, append to list + # Note EMBL states "A keyword is never split between lines." + self.data.annotations["keywords"].extend(self._split_keywords(content)) + else: + self.data.annotations["keywords"] = self._split_keywords(content) + + def segment(self, content): + self.data.annotations["segment"] = content + + def source(self, content): + # Note that some software (e.g. VectorNTI) may produce an empty + # source (rather than using a dot/period as might be expected). + if content == "": + source_info = "" + elif content[-1] == ".": + source_info = content[:-1] + else: + source_info = content + self.data.annotations["source"] = source_info + + def organism(self, content): + self.data.annotations["organism"] = content + + def taxonomy(self, content): + """Record (another line of) the taxonomy lineage.""" + lineage = self._split_taxonomy(content) + try: + self.data.annotations["taxonomy"].extend(lineage) + except KeyError: + self.data.annotations["taxonomy"] = lineage + + def reference_num(self, content): + """Signal the beginning of a new reference object.""" + # if we have a current reference that hasn't been added to + # the list of references, add it. + if self._cur_reference is not None: + self.data.annotations["references"].append(self._cur_reference) + else: + self.data.annotations["references"] = [] + + self._cur_reference = SeqFeature.Reference() + + def reference_bases(self, content): + """Attempt to determine the sequence region the reference entails. + + Possible types of information we may have to deal with: + + (bases 1 to 86436) + (sites) + (bases 1 to 105654; 110423 to 111122) + 1 (residues 1 to 182) + """ + # first remove the parentheses + assert content.endswith(")"), content + ref_base_info = content[1:-1] + + all_locations = [] + # parse if we've got 'bases' and 'to' + if "bases" in ref_base_info and "to" in ref_base_info: + # get rid of the beginning 'bases' + ref_base_info = ref_base_info[5:] + locations = self._split_reference_locations(ref_base_info) + all_locations.extend(locations) + elif "residues" in ref_base_info and "to" in ref_base_info: + residues_start = ref_base_info.find("residues") + # get only the information after "residues" + ref_base_info = ref_base_info[(residues_start + len("residues ")) :] + locations = self._split_reference_locations(ref_base_info) + all_locations.extend(locations) + + # make sure if we are not finding information then we have + # the string 'sites' or the string 'bases' + elif ref_base_info == "sites" or ref_base_info.strip() == "bases": + pass + # otherwise raise an error + else: + raise ValueError( + "Could not parse base info %s in record %s" + % (ref_base_info, self.data.id) + ) + + self._cur_reference.location = all_locations + + def _split_reference_locations(self, location_string): + """Get reference locations out of a string of reference information (PRIVATE). + + The passed string should be of the form:: + + 1 to 20; 20 to 100 + + This splits the information out and returns a list of location objects + based on the reference locations. 
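+
+        For example, a hypothetical "1 to 20; 20 to 100" would yield two
+        locations spanning [0:20] and [19:100] in Python counting.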
+ """ + # split possibly multiple locations using the ';' + all_base_info = location_string.split(";") + + new_locations = [] + for base_info in all_base_info: + start, end = base_info.split("to") + new_start, new_end = self._convert_to_python_numbers( + int(start.strip()), int(end.strip()) + ) + this_location = SeqFeature.FeatureLocation(new_start, new_end) + new_locations.append(this_location) + return new_locations + + def authors(self, content): + if self._cur_reference.authors: + self._cur_reference.authors += " " + content + else: + self._cur_reference.authors = content + + def consrtm(self, content): + if self._cur_reference.consrtm: + self._cur_reference.consrtm += " " + content + else: + self._cur_reference.consrtm = content + + def title(self, content): + if self._cur_reference is None: + warnings.warn( + "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning + ) + elif self._cur_reference.title: + self._cur_reference.title += " " + content + else: + self._cur_reference.title = content + + def journal(self, content): + if self._cur_reference.journal: + self._cur_reference.journal += " " + content + else: + self._cur_reference.journal = content + + def medline_id(self, content): + self._cur_reference.medline_id = content + + def pubmed_id(self, content): + self._cur_reference.pubmed_id = content + + def remark(self, content): + """Deal with a reference comment.""" + if self._cur_reference.comment: + self._cur_reference.comment += " " + content + else: + self._cur_reference.comment = content + + def comment(self, content): + try: + self.data.annotations["comment"] += "\n" + "\n".join(content) + except KeyError: + self.data.annotations["comment"] = "\n".join(content) + + def structured_comment(self, content): + self.data.annotations["structured_comment"] = content + + def features_line(self, content): + """Get ready for the feature table when we reach the FEATURE line.""" + self.start_feature_table() + + def start_feature_table(self): + """Indicate we've got to the start of the feature table.""" + # make sure we've added on our last reference object + if self._cur_reference is not None: + self.data.annotations["references"].append(self._cur_reference) + self._cur_reference = None + + def feature_key(self, content): + # start a new feature + self._cur_feature = SeqFeature.SeqFeature() + self._cur_feature.type = content + self.data.features.append(self._cur_feature) + + def location(self, content): + """Parse out location information from the location string. + + This uses simple Python code with some regular expressions to do the + parsing, and then translates the results into appropriate objects. + """ + # clean up newlines and other whitespace inside the location before + # parsing - locations should have no whitespace whatsoever + location_line = self._clean_location(content) + + # Older records have junk like replace(266,"c") in the + # location line. Newer records just replace this with + # the number 266 and have the information in a more reasonable + # place. So we'll just grab out the number and feed this to the + # parser. We shouldn't really be losing any info this way. 
+ if "replace" in location_line: + comma_pos = location_line.find(",") + location_line = location_line[8:comma_pos] + + cur_feature = self._cur_feature + + # Handle top level complement here for speed + if location_line.startswith("complement("): + assert location_line.endswith(")") + location_line = location_line[11:-1] + strand = -1 + elif "PROTEIN" in self._seq_type.upper(): + strand = None + else: + # Assume nucleotide otherwise feature strand for + # GenBank files with bad LOCUS lines set to None + strand = 1 + + # Special case handling of the most common cases for speed + if _re_simple_location.match(location_line): + # e.g. "123..456" + s, e = location_line.split("..") + try: + cur_feature.location = SeqFeature.FeatureLocation( + int(s) - 1, int(e), strand + ) + except ValueError: + # Could be non-integers, more likely bad origin wrapping + cur_feature.location = _loc( + location_line, + self._expected_size, + strand, + seq_type=self._seq_type.lower(), + ) + return + + if ",)" in location_line: + warnings.warn( + "Dropping trailing comma in malformed feature location", + BiopythonParserWarning, + ) + location_line = location_line.replace(",)", ")") + + if _solo_bond.search(location_line): + # e.g. bond(196) + # e.g. join(bond(284),bond(305),bond(309),bond(305)) + warnings.warn( + "Dropping bond qualifier in feature location", BiopythonParserWarning + ) + # There ought to be a better way to do this... + for x in _solo_bond.finditer(location_line): + x = x.group() + location_line = location_line.replace(x, x[5:-1]) + + if _re_simple_compound.match(location_line): + # e.g. join(<123..456,480..>500) + i = location_line.find("(") + # cur_feature.location_operator = location_line[:i] + # we can split on the comma because these are simple locations + locs = [] + for part in location_line[i + 1 : -1].split(","): + s, e = part.split("..") + + try: + locs.append(SeqFeature.FeatureLocation(int(s) - 1, int(e), strand)) + except ValueError: + # Could be non-integers, more likely bad origin wrapping + + # In the case of bad origin wrapping, _loc will return + # a CompoundLocation. CompoundLocation.parts returns a + # list of the FeatureLocation objects inside the + # CompoundLocation. + locs.extend( + _loc( + part, self._expected_size, strand, self._seq_type.lower() + ).parts + ) + + if len(locs) < 2: + # The CompoundLocation will raise a ValueError here! + warnings.warn( + "Should have at least 2 parts for compound location", + BiopythonParserWarning, + ) + cur_feature.location = None + return + if strand == -1: + cur_feature.location = SeqFeature.CompoundLocation( + locs[::-1], operator=location_line[:i] + ) + else: + cur_feature.location = SeqFeature.CompoundLocation( + locs, operator=location_line[:i] + ) + return + + # Handle the general case with more complex regular expressions + if _re_complex_location.match(location_line): + # e.g. "AL121804.2:41..610" + cur_feature.location = _loc( + location_line, + self._expected_size, + strand, + seq_type=self._seq_type.lower(), + ) + return + + if _re_complex_compound.match(location_line): + i = location_line.find("(") + # cur_feature.location_operator = location_line[:i] + # Can't split on the comma because of positions like one-of(1,2,3) + locs = [] + for part in _split_compound_loc(location_line[i + 1 : -1]): + if part.startswith("complement("): + assert part[-1] == ")" + part = part[11:-1] + assert strand != -1, "Double complement?" + part_strand = -1 + else: + part_strand = strand + try: + # There is likely a problem with origin wrapping. 
+ # Using _loc to return a CompoundLocation of the + # wrapped feature and returning the two FeatureLocation + # objects to extend to the list of feature locations. + loc = _loc( + part, + self._expected_size, + part_strand, + seq_type=self._seq_type.lower(), + ).parts + + except ValueError: + print(location_line) + print(part) + raise + # loc will be a list of one or two FeatureLocation items. + locs.extend(loc) + # Historically a join on the reverse strand has been represented + # in Biopython with both the parent SeqFeature and its children + # (the exons for a CDS) all given a strand of -1. Likewise, for + # a join feature on the forward strand they all have strand +1. + # However, we must also consider evil mixed strand examples like + # this, join(complement(69611..69724),139856..140087,140625..140650) + if strand == -1: + # Whole thing was wrapped in complement(...) + for l in locs: + assert l.strand == -1 + # Reverse the backwards order used in GenBank files + # with complement(join(...)) + cur_feature.location = SeqFeature.CompoundLocation( + locs[::-1], operator=location_line[:i] + ) + else: + cur_feature.location = SeqFeature.CompoundLocation( + locs, operator=location_line[:i] + ) + return + # Not recognised + if "order" in location_line and "join" in location_line: + # See Bug 3197 + msg = ( + 'Combinations of "join" and "order" within the same ' + "location (nested operators) are illegal:\n" + location_line + ) + raise LocationParserError(msg) + # This used to be an error.... + cur_feature.location = None + warnings.warn( + BiopythonParserWarning( + "Couldn't parse feature location: %r" % location_line + ) + ) + + def feature_qualifier(self, key, value): + """When we get a qualifier key and its value. + + Can receive None, since you can have valueless keys such as /pseudo + """ + # Hack to try to preserve historical behaviour of /pseudo etc + if value is None: + # if the key doesn't exist yet, add an empty string + if key not in self._cur_feature.qualifiers: + self._cur_feature.qualifiers[key] = [""] + return + # otherwise just skip this key + return + + # Remove enclosing quotation marks + value = re.sub('^"|"$', "", value) + + # Handle NCBI escaping + # Warn if escaping is not according to standard + if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value): + warnings.warn( + 'The NCBI states double-quote characters like " should be escaped as "" ' + "(two double - quotes), but here it was not: %r" % value, + BiopythonParserWarning, + ) + # Undo escaping, repeated double quotes -> one double quote + value = value.replace('""', '"') + + if self._feature_cleaner is not None: + value = self._feature_cleaner.clean_value(key, value) + + # if the qualifier name exists, append the value + if key in self._cur_feature.qualifiers: + self._cur_feature.qualifiers[key].append(value) + # otherwise start a new list of the key with its values + else: + self._cur_feature.qualifiers[key] = [value] + + def feature_qualifier_name(self, content_list): + """Use feature_qualifier instead (OBSOLETE).""" + raise NotImplementedError("Use the feature_qualifier method instead.") + + def feature_qualifier_description(self, content): + """Use feature_qualifier instead (OBSOLETE).""" + raise NotImplementedError("Use the feature_qualifier method instead.") + + def contig_location(self, content): + """Deal with CONTIG information.""" + # Historically this was stored as a SeqFeature object, but it was + # stored under record.annotations["contig"] and not under + # record.features with the other SeqFeature objects. 
+ # + # The CONTIG location line can include additional tokens like + # Gap(), Gap(100) or Gap(unk100) which are not used in the feature + # location lines, so storing it using SeqFeature based location + # objects is difficult. + # + # We now store this a string, which means for BioSQL we are now in + # much better agreement with how BioPerl records the CONTIG line + # in the database. + # + # NOTE - This code assumes the scanner will return all the CONTIG + # lines already combined into one long string! + self.data.annotations["contig"] = content + + def origin_name(self, content): + pass + + def base_count(self, content): + pass + + def base_number(self, content): + pass + + def sequence(self, content): + """Add up sequence information as we get it. + + To try and make things speedier, this puts all of the strings + into a list of strings, and then uses string.join later to put + them together. Supposedly, this is a big time savings + """ + assert " " not in content + self._seq_data.append(content.upper()) + + def record_end(self, content): + """Clean up when we've finished the record.""" + # Try and append the version number to the accession for the full id + if not self.data.id: + if "accessions" in self.data.annotations: + raise ValueError( + "Problem adding version number to accession: " + + str(self.data.annotations["accessions"]) + ) + self.data.id = self.data.name # Good fall back? + elif self.data.id.count(".") == 0: + try: + self.data.id += ".%i" % self.data.annotations["sequence_version"] + except KeyError: + pass + + # add the sequence information + + sequence = "".join(self._seq_data) + + if ( + self._expected_size is not None + and len(sequence) != 0 + and self._expected_size != len(sequence) + ): + warnings.warn( + "Expected sequence length %i, found %i (%s)." + % (self._expected_size, len(sequence), self.data.id), + BiopythonParserWarning, + ) + + molecule_type = None + if self._seq_type: + # mRNA is really also DNA, since it is actually cDNA + if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper(): + molecule_type = "DNA" + # are there ever really RNA sequences in GenBank? + elif "RNA" in self._seq_type.upper(): + # Even for data which was from RNA, the sequence string + # is usually given as DNA (T not U). Bug 3010 + molecule_type = "RNA" + elif ( + "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT" + ): # PRT is used in EMBL-bank for patents + molecule_type = "protein" + # work around ugly GenBank records which have circular or + # linear but no indication of sequence type + elif self._seq_type in ["circular", "linear", "unspecified"]: + pass + # we have a bug if we get here + else: + raise ValueError( + "Could not determine molecule_type for seq_type %s" % self._seq_type + ) + # Don't overwrite molecule_type + if molecule_type is not None: + self.data.annotations["molecule_type"] = self.data.annotations.get( + "molecule_type", molecule_type + ) + if not sequence and self._expected_size: + self.data.seq = Seq(None, length=self._expected_size) + else: + self.data.seq = Seq(sequence) + + +class _RecordConsumer(_BaseGenBankConsumer): + """Create a GenBank Record object from scanner generated information (PRIVATE).""" + + def __init__(self): + _BaseGenBankConsumer.__init__(self) + from . 
import Record + + self.data = Record.Record() + + self._seq_data = [] + self._cur_reference = None + self._cur_feature = None + self._cur_qualifier = None + + def tls(self, content): + self.data.tls = content.split("-") + + def tsa(self, content): + self.data.tsa = content.split("-") + + def wgs(self, content): + self.data.wgs = content.split("-") + + def add_wgs_scafld(self, content): + self.data.wgs_scafld.append(content.split("-")) + + def locus(self, content): + self.data.locus = content + + def size(self, content): + self.data.size = content + + def residue_type(self, content): + # Be lenient about parsing, but technically lowercase residue types are malformed. + if "dna" in content or "rna" in content: + warnings.warn( + "Invalid seq_type (%s): DNA/RNA should be uppercase." % content, + BiopythonParserWarning, + ) + self.data.residue_type = content + + def data_file_division(self, content): + self.data.data_file_division = content + + def date(self, content): + self.data.date = content + + def definition(self, content): + self.data.definition = content + + def accession(self, content): + for acc in self._split_accessions(content): + if acc not in self.data.accession: + self.data.accession.append(acc) + + def molecule_type(self, mol_type): + """Validate and record the molecule type (for round-trip etc).""" + if mol_type: + if "circular" in mol_type or "linear" in mol_type: + raise ParserFailureError( + "Molecule type %r should not include topology" % mol_type + ) + + # Writing out records will fail if we have a lower case DNA + # or RNA string in here, so upper case it. + # This is a bit ugly, but we don't want to upper case e.g. + # the m in mRNA, but thanks to the strip we lost the spaces + # so we need to index from the back + if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): + warnings.warn( + "Non-upper case molecule type in LOCUS line: %s" % mol_type, + BiopythonParserWarning, + ) + + self.data.molecule_type = mol_type + + def topology(self, topology): + """Validate and record sequence topology. + + The topology argument should be "linear" or "circular" (string). + """ + if topology: + if topology not in ["linear", "circular"]: + raise ParserFailureError( + "Unexpected topology %r should be linear or circular" % topology + ) + self.data.topology = topology + + def nid(self, content): + self.data.nid = content + + def pid(self, content): + self.data.pid = content + + def version(self, content): + self.data.version = content + + def db_source(self, content): + self.data.db_source = content.rstrip() + + def gi(self, content): + self.data.gi = content + + def keywords(self, content): + self.data.keywords = self._split_keywords(content) + + def project(self, content): + self.data.projects.extend(p for p in content.split() if p) + + def dblink(self, content): + self.data.dblinks.append(content) + + def segment(self, content): + self.data.segment = content + + def source(self, content): + self.data.source = content + + def organism(self, content): + self.data.organism = content + + def taxonomy(self, content): + self.data.taxonomy = self._split_taxonomy(content) + + def reference_num(self, content): + """Grab the reference number and signal the start of a new reference.""" + # check if we have a reference to add + if self._cur_reference is not None: + self.data.references.append(self._cur_reference) + + from . 
import Record + + self._cur_reference = Record.Reference() + self._cur_reference.number = content + + def reference_bases(self, content): + self._cur_reference.bases = content + + def authors(self, content): + self._cur_reference.authors = content + + def consrtm(self, content): + self._cur_reference.consrtm = content + + def title(self, content): + if self._cur_reference is None: + warnings.warn( + "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning + ) + return + self._cur_reference.title = content + + def journal(self, content): + self._cur_reference.journal = content + + def medline_id(self, content): + self._cur_reference.medline_id = content + + def pubmed_id(self, content): + self._cur_reference.pubmed_id = content + + def remark(self, content): + self._cur_reference.remark = content + + def comment(self, content): + self.data.comment += "\n".join(content) + + def structured_comment(self, content): + self.data.structured_comment = content + + def primary_ref_line(self, content): + """Save reference data for the PRIMARY line.""" + self.data.primary.append(content) + + def primary(self, content): + pass + + def features_line(self, content): + """Get ready for the feature table when we reach the FEATURE line.""" + self.start_feature_table() + + def start_feature_table(self): + """Signal the start of the feature table.""" + # we need to add on the last reference + if self._cur_reference is not None: + self.data.references.append(self._cur_reference) + + def feature_key(self, content): + """Grab the key of the feature and signal the start of a new feature.""" + # first add on feature information if we've got any + self._add_feature() + + from . import Record + + self._cur_feature = Record.Feature() + self._cur_feature.key = content + + def _add_feature(self): + """Add a feature to the record, with relevant checks (PRIVATE). + + This does all of the appropriate checking to make sure we haven't + left any info behind, and that we are only adding info if it + exists. + """ + if self._cur_feature is not None: + # if we have a left over qualifier, add it to the qualifiers + # on the current feature + if self._cur_qualifier is not None: + self._cur_feature.qualifiers.append(self._cur_qualifier) + + self._cur_qualifier = None + self.data.features.append(self._cur_feature) + + def location(self, content): + self._cur_feature.location = self._clean_location(content) + + def feature_qualifier(self, key, value): + self.feature_qualifier_name([key]) + if value is not None: + self.feature_qualifier_description(value) + + def feature_qualifier_name(self, content_list): + """Deal with qualifier names. + + We receive a list of keys, since you can have valueless keys such as + /pseudo which would be passed in with the next key (since no other + tags separate them in the file) + """ + from . 
import Record + + for content in content_list: + # the record parser keeps the /s -- add them if we don't have 'em + if not content.startswith("/"): + content = "/%s" % content + # add on a qualifier if we've got one + if self._cur_qualifier is not None: + self._cur_feature.qualifiers.append(self._cur_qualifier) + + self._cur_qualifier = Record.Qualifier() + self._cur_qualifier.key = content + + def feature_qualifier_description(self, content): + # if we have info then the qualifier key should have a ='s + if "=" not in self._cur_qualifier.key: + self._cur_qualifier.key = "%s=" % self._cur_qualifier.key + cur_content = self._remove_newlines(content) + # remove all spaces from the value if it is a type where spaces + # are not important + for remove_space_key in self.__class__.remove_space_keys: + if remove_space_key in self._cur_qualifier.key: + cur_content = self._remove_spaces(cur_content) + self._cur_qualifier.value = self._normalize_spaces(cur_content) + + def base_count(self, content): + self.data.base_counts = content + + def origin_name(self, content): + self.data.origin = content + + def contig_location(self, content): + """Signal that we have contig information to add to the record.""" + self.data.contig = self._clean_location(content) + + def sequence(self, content): + """Add sequence information to a list of sequence strings. + + This removes spaces in the data and uppercases the sequence, and + then adds it to a list of sequences. Later on we'll join this + list together to make the final sequence. This is faster than + adding on the new string every time. + """ + assert " " not in content + self._seq_data.append(content.upper()) + + def record_end(self, content): + """Signal the end of the record and do any necessary clean-up.""" + # add together all of the sequence parts to create the + # final sequence string + self.data.sequence = "".join(self._seq_data) + # add on the last feature + self._add_feature() + + +def parse(handle): + """Iterate over GenBank formatted entries as Record objects. + + >>> from Bio import GenBank + >>> with open("GenBank/NC_000932.gb") as handle: + ... for record in GenBank.parse(handle): + ... print(record.accession) + ['NC_000932'] + + To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb") + instead. + """ + return iter(Iterator(handle, RecordParser())) + + +def read(handle): + """Read a handle containing a single GenBank entry as a Record object. + + >>> from Bio import GenBank + >>> with open("GenBank/NC_000932.gb") as handle: + ... record = GenBank.read(handle) + ... print(record.accession) + ['NC_000932'] + + To get a SeqRecord object use Bio.SeqIO.read(..., format="gb") + instead. 
+ """ + iterator = parse(handle) + try: + record = next(iterator) + except StopIteration: + raise ValueError("No records found in handle") from None + try: + next(iterator) + raise ValueError("More than one record found in handle") + except StopIteration: + pass + return record + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc new file mode 100644 index 0000000..862e2a0 Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc differ diff --git a/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc new file mode 100644 index 0000000..24b0a53 Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc differ diff --git a/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..9d0e9c2 Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000..74c8727 Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc differ diff --git a/code/lib/Bio/GenBank/utils.py b/code/lib/Bio/GenBank/utils.py new file mode 100644 index 0000000..6f0eb28 --- /dev/null +++ b/code/lib/Bio/GenBank/utils.py @@ -0,0 +1,68 @@ +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# + +"""Useful utilities for helping in parsing GenBank files.""" + + +class FeatureValueCleaner: + r"""Provide specialized capabilities for cleaning up values in features. + + This class is designed to provide a mechanism to clean up and process + values in the key/value pairs of GenBank features. This is useful + because in cases like:: + + /translation="MED + YDPWNLRFQSKYKSRDA" + + you'll otherwise end up with white space in it. + + This cleaning needs to be done on a case by case basis since it is + impossible to interpret whether you should be concatenating everything + (as in translations), or combining things with spaces (as might be + the case with /notes). + + >>> cleaner = FeatureValueCleaner(["translation"]) + >>> cleaner + FeatureValueCleaner(['translation']) + >>> cleaner.clean_value("translation", "MED\nYDPWNLRFQSKYKSRDA") + 'MEDYDPWNLRFQSKYKSRDA' + """ + + keys_to_process = ["translation"] + + def __init__(self, to_process=keys_to_process): + """Initialize with the keys we should deal with.""" + self._to_process = to_process + + def __repr__(self): + """Return a string representation of the class.""" + return f"{self.__class__.__name__}({self._to_process!r})" + + def clean_value(self, key_name, value): + """Clean the specified value and return it. + + If the value is not specified to be dealt with, the original value + will be returned. 
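+
+        For example, with the default setup only ``translation`` values are
+        processed; ``clean_value("note", value)`` returns ``value`` unchanged.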
+ """ + if key_name in self._to_process: + try: + cleaner = getattr(self, "_clean_%s" % key_name) + except AttributeError: + raise AssertionError( + "No function to clean key: %s" % key_name + ) from None + value = cleaner(value) + return value + + def _clean_translation(self, value): + """Concatenate a translation value to one long protein string (PRIVATE).""" + translation_parts = value.split() + return "".join(translation_parts) + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/Geo/Record.py b/code/lib/Bio/Geo/Record.py new file mode 100644 index 0000000..5e38c78 --- /dev/null +++ b/code/lib/Bio/Geo/Record.py @@ -0,0 +1,92 @@ +# Copyright 2001 by Katharine Lindner. All rights reserved. +# Copyright 2006 by PeterC. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Hold GEO data in a straightforward format. + +classes: +o Record - All of the information in an GEO record. + +See http://www.ncbi.nlm.nih.gov/geo/ +""" + + +class Record: + """Hold GEO information in a format similar to the original record. + + The Record class is meant to make data easy to get to when you are + just interested in looking at GEO data. + + Attributes: + entity_type + entity_id + entity_attributes + col_defs + table_rows + + """ + + def __init__(self): + """Initialize the class.""" + self.entity_type = "" + self.entity_id = "" + self.entity_attributes = {} + self.col_defs = {} + self.table_rows = [] + + def __str__(self): + """Return the GEO record as a string.""" + output = "" + output += "GEO Type: %s\n" % self.entity_type + output += "GEO Id: %s\n" % self.entity_id + att_keys = sorted(self.entity_attributes) + for key in att_keys: + contents = self.entity_attributes[key] + if isinstance(contents, list): + for item in contents: + try: + output += "%s: %s\n" % (key, item[:40]) + output += out_block(item[40:]) + except Exception: # TODO: IndexError? + pass + elif isinstance(contents, str): + output += "%s: %s\n" % (key, contents[:40]) + output += out_block(contents[40:]) + else: + print(contents) + output += "%s: %s\n" % (key, contents[:40]) + output += out_block(contents[40:]) + col_keys = sorted(self.col_defs) + output += "Column Header Definitions\n" + for key in col_keys: + val = self.col_defs[key] + output += " %s: %s\n" % (key, val[:40]) + output += out_block(val[40:], " ") + # May have to display VERY large tables, + # so only show the first 20 lines of data + MAX_ROWS = 20 + 1 # include header in count + for row in self.table_rows[0:MAX_ROWS]: + output += "%s: " % self.table_rows.index(row) + for col in row: + output += "%s\t" % col + output += "\n" + if len(self.table_rows) > MAX_ROWS: + output += "...\n" + row = self.table_rows[-1] + output += "%s: " % self.table_rows.index(row) + for col in row: + output += "%s\t" % col + output += "\n" + + return output + + +def out_block(text, prefix=""): + """Format text in blocks of 80 chars with an additional optional prefix.""" + output = "" + for j in range(0, len(text), 80): + output += "%s%s\n" % (prefix, text[j : j + 80]) + output += "\n" + return output diff --git a/code/lib/Bio/Geo/__init__.py b/code/lib/Bio/Geo/__init__.py new file mode 100644 index 0000000..6735e9a --- /dev/null +++ b/code/lib/Bio/Geo/__init__.py @@ -0,0 +1,67 @@ +# Copyright 2001 by Katharine Lindner. All rights reserved. +# Copyright 2006 by PeterC. All rights reserved. 
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Parser for files from NCBI's Gene Expression Omnibus (GEO).
+
+http://www.ncbi.nlm.nih.gov/geo/
+"""
+
+from . import Record
+
+
+def _read_key_value(line):
+    words = line[1:].split("=", 1)
+    try:
+        key, value = words
+        value = value.strip()
+    except ValueError:
+        key = words[0]
+        value = ""
+    key = key.strip()
+    return key, value
+
+
+def parse(handle):
+    """Read Gene Expression Omnibus records from file handle.
+
+    Returns a generator object which yields Bio.Geo.Record() objects.
+    """
+    record = None
+    for line in handle:
+        line = line.strip("\n").strip("\r")
+        if not line:
+            continue  # Ignore empty lines
+        c = line[0]
+        if c == "^":
+            if record:
+                yield record
+            record = Record.Record()
+            record.entity_type, record.entity_id = _read_key_value(line)
+        elif c == "!":
+            if line in (
+                "!Sample_table_begin",
+                "!Sample_table_end",
+                "!Platform_table_begin",
+                "!Platform_table_end",
+            ):
+                continue
+            key, value = _read_key_value(line)
+            if key in record.entity_attributes:
+                if isinstance(record.entity_attributes[key], list):
+                    record.entity_attributes[key].append(value)
+                else:
+                    existing = record.entity_attributes[key]
+                    record.entity_attributes[key] = [existing, value]
+            else:
+                record.entity_attributes[key] = value
+        elif c == "#":
+            key, value = _read_key_value(line)
+            assert key not in record.col_defs
+            record.col_defs[key] = value
+        else:
+            row = line.split("\t")
+            record.table_rows.append(row)
+    # Don't yield None if the handle contained no records
+    if record:
+        yield record
diff --git a/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc
new file mode 100644
index 0000000..8861450
Binary files /dev/null and b/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc differ
diff --git a/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..1c5efce
Binary files /dev/null and b/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/BasicChromosome.py b/code/lib/Bio/Graphics/BasicChromosome.py
new file mode 100644
index 0000000..91e6445
--- /dev/null
+++ b/code/lib/Bio/Graphics/BasicChromosome.py
@@ -0,0 +1,823 @@
+# Copyright 2001, 2003 by Brad Chapman. All rights reserved.
+# Revisions copyright 2011 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Draw representations of organism chromosomes with added information.
+
+These classes are meant to model the drawing of pictures of chromosomes.
+This can be useful for lots of things, including displaying markers on
+a chromosome (i.e. for genetic mapping) and showing synteny between two
+chromosomes.
+
+The structure of these classes is intended to be a Composite, so that
+it will be easy to plug in and switch different parts without
+breaking the general drawing capabilities of the system. The
+relationship between classes is that everything derives from
+_ChromosomeComponent, which specifies the overall interface. The parts
+then are related so that an Organism contains Chromosomes, and these
+Chromosomes contain ChromosomeSegments. This representation differs
+from the canonical composite structure in that we don't really have
+'leaf' nodes here -- all components can potentially hold sub-components.
+
+Most of the time the ChromosomeSegment class is what you'll want to
+customize for specific drawing tasks.
+
+For providing drawing capabilities, these classes use reportlab:
+
+http://www.reportlab.com
+
+This provides nice output in PDF, SVG and postscript. If you have
+reportlab's renderPM module installed you can also use PNG etc.
+"""
+
+# reportlab
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.units import inch
+from reportlab.lib import colors
+from reportlab.pdfbase.pdfmetrics import stringWidth
+
+from reportlab.graphics.shapes import Drawing, String, Line, Rect, Wedge, ArcPath
+from reportlab.graphics.widgetbase import Widget
+
+from Bio.Graphics import _write
+from Bio.Graphics.GenomeDiagram import _Colors
+
+
+_color_trans = _Colors.ColorTranslator()
+
+
+class _ChromosomeComponent(Widget):
+    """Base class specifying the interface for a component of the system.
+
+    This class should not be instantiated directly, but should be used
+    from derived classes.
+    """
+
+    def __init__(self):
+        """Initialize a chromosome component.
+
+        Attributes:
+        - _sub_components -- Any components which are contained under
+          this parent component. This attribute should be accessed through
+          the add() and remove() functions.
+
+        """
+        self._sub_components = []
+
+    def add(self, component):
+        """Add a sub_component to the list of components under this item."""
+        if not isinstance(component, _ChromosomeComponent):
+            raise TypeError(
+                "Expected a _ChromosomeComponent object, got %s" % component
+            )
+
+        self._sub_components.append(component)
+
+    def remove(self, component):
+        """Remove the specified component from the subcomponents.
+
+        Raises a ValueError if the component is not registered as a
+        sub_component.
+        """
+        try:
+            self._sub_components.remove(component)
+        except ValueError:
+            raise ValueError(
+                "Component %s not found in sub_components." % component
+            ) from None
+
+    def draw(self):
+        """Draw the specified component."""
+        raise AssertionError("Subclasses must implement.")
+
+
+class Organism(_ChromosomeComponent):
+    """Top level class for drawing chromosomes.
+
+    This class holds information about an organism and all of its
+    chromosomes, and provides the top level object which could be used
+    for drawing a chromosome representation of an organism.
+
+    Chromosomes should be added and removed from the Organism via the
+    add and remove functions.
+    """
+
+    def __init__(self, output_format="pdf"):
+        """Initialize the class."""
+        _ChromosomeComponent.__init__(self)
+
+        # customizable attributes
+        self.page_size = letter
+        self.title_size = 20
+
+        # Do we need this given we don't draw a legend?
+        # If so, should be a public API...
+        self._legend_height = 0  # 2 * inch
+
+        self.output_format = output_format
+
+    def draw(self, output_file, title):
+        """Draw out the information for the Organism.
+
+        Arguments:
+        - output_file -- The name of a file specifying where the
+          document should be saved, or a handle to be written to.
+          The output format is set when creating the Organism object.
+          Alternatively, output_file=None will return the drawing using
+          the low-level ReportLab objects (for further processing, such
+          as adding additional graphics, before writing).
+        - title -- The output title of the produced document.
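+
+        A minimal usage sketch (output file name hypothetical)::
+
+            organism = Organism()
+            organism.add(Chromosome("I"))
+            organism.draw("organism.pdf", "Example Organism")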
+
+        """
+        width, height = self.page_size
+        cur_drawing = Drawing(width, height)
+
+        self._draw_title(cur_drawing, title, width, height)
+
+        cur_x_pos = inch * 0.5
+        if len(self._sub_components) > 0:
+            x_pos_change = (width - inch) / len(self._sub_components)
+        else:
+            # no sub_components to draw
+            pass
+
+        for sub_component in self._sub_components:
+            # set the drawing location of the chromosome
+            sub_component.start_x_position = cur_x_pos + 0.05 * x_pos_change
+            sub_component.end_x_position = cur_x_pos + 0.95 * x_pos_change
+            sub_component.start_y_position = height - 1.5 * inch
+            sub_component.end_y_position = self._legend_height + 1 * inch
+
+            # do the drawing
+            sub_component.draw(cur_drawing)
+
+            # update the locations for the next chromosome
+            cur_x_pos += x_pos_change
+
+        self._draw_legend(cur_drawing, self._legend_height + 0.5 * inch, width)
+
+        if output_file is None:
+            # Let the user take care of writing to the file...
+            return cur_drawing
+
+        return _write(cur_drawing, output_file, self.output_format)
+
+    def _draw_title(self, cur_drawing, title, width, height):
+        """Write out the title of the organism figure (PRIVATE)."""
+        title_string = String(width / 2, height - inch, title)
+        title_string.fontName = "Helvetica-Bold"
+        title_string.fontSize = self.title_size
+        title_string.textAnchor = "middle"
+
+        cur_drawing.add(title_string)
+
+    def _draw_legend(self, cur_drawing, start_y, width):
+        """Draw a legend for the figure (PRIVATE).
+
+        Subclasses should implement this (see also self._legend_height) to
+        provide specialized legends.
+        """
+        pass
+
+
+class Chromosome(_ChromosomeComponent):
+    """Class for drawing a chromosome of an organism.
+
+    This organizes the drawing of a single organism's chromosome. This
+    class can be instantiated directly, but the draw method makes the
+    most sense to be called in the context of an organism.
+    """
+
+    def __init__(self, chromosome_name):
+        """Initialize a Chromosome for drawing.
+
+        Arguments:
+        - chromosome_name - The label for the chromosome.
+
+        Attributes:
+        - start_x_position, end_x_position - The x positions on the page
+          where the chromosome should be drawn. This allows multiple
+          chromosomes to be drawn on a single page.
+        - start_y_position, end_y_position - The y positions on the page
+          where the chromosome should be contained.
+
+        Configuration Attributes:
+        - title_size - The size of the chromosome title.
+        - scale_num - A number to scale the drawing by. This is useful if
+          you want to draw multiple chromosomes of different sizes at the
+          same scale. If this is not set, then the chromosome drawing will
+          be scaled by the number of segments in the chromosome (so each
+          chromosome will be the exact same final size).
+
+        """
+        _ChromosomeComponent.__init__(self)
+
+        self._name = chromosome_name
+
+        self.start_x_position = -1
+        self.end_x_position = -1
+        self.start_y_position = -1
+        self.end_y_position = -1
+
+        self.title_size = 20
+        self.scale_num = None
+
+        self.label_size = 6
+        self.chr_percent = 0.25
+        self.label_sep_percent = self.chr_percent * 0.5
+        self._color_labels = False
+
+    def subcomponent_size(self):
+        """Return the scaled size of all subcomponents of this component."""
+        total_sub = 0
+        for sub_component in self._sub_components:
+            total_sub += sub_component.scale
+
+        return total_sub
+
+    def draw(self, cur_drawing):
+        """Draw a chromosome on the specified template.
+
+        Ideally, the x_position and y_*_position attributes should be
+        set prior to drawing -- otherwise we're going to have some problems.
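+
+        In normal use these attributes are set by the parent Organism in
+        its own draw method before this method is called.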
+ """ + for position in ( + self.start_x_position, + self.end_x_position, + self.start_y_position, + self.end_y_position, + ): + assert position != -1, "Need to set drawing coordinates." + + # first draw all of the sub-sections of the chromosome -- this + # will actually be the picture of the chromosome + cur_y_pos = self.start_y_position + if self.scale_num: + y_pos_change = ( + self.start_y_position * 0.95 - self.end_y_position + ) / self.scale_num + elif len(self._sub_components) > 0: + y_pos_change = ( + self.start_y_position * 0.95 - self.end_y_position + ) / self.subcomponent_size() + # no sub_components to draw + else: + pass + + left_labels = [] + right_labels = [] + for sub_component in self._sub_components: + this_y_pos_change = sub_component.scale * y_pos_change + + # set the location of the component to draw + sub_component.start_x_position = self.start_x_position + sub_component.end_x_position = self.end_x_position + sub_component.start_y_position = cur_y_pos + sub_component.end_y_position = cur_y_pos - this_y_pos_change + + # draw the sub component + sub_component._left_labels = [] + sub_component._right_labels = [] + sub_component.draw(cur_drawing) + left_labels += sub_component._left_labels + right_labels += sub_component._right_labels + + # update the position for the next component + cur_y_pos -= this_y_pos_change + + self._draw_labels(cur_drawing, left_labels, right_labels) + self._draw_label(cur_drawing, self._name) + + def _draw_label(self, cur_drawing, label_name): + """Draw a label for the chromosome (PRIVATE).""" + x_position = 0.5 * (self.start_x_position + self.end_x_position) + y_position = self.end_y_position + + label_string = String(x_position, y_position, label_name) + label_string.fontName = "Times-BoldItalic" + label_string.fontSize = self.title_size + label_string.textAnchor = "middle" + + cur_drawing.add(label_string) + + def _draw_labels(self, cur_drawing, left_labels, right_labels): + """Layout and draw sub-feature labels for the chromosome (PRIVATE). + + Tries to place each label at the same vertical position as the + feature it applies to, but will adjust the positions to avoid or + at least reduce label overlap. + + Draws the label text and a coloured line linking it to the + location (i.e. feature) it applies to. + """ + if not self._sub_components: + return + color_label = self._color_labels + + segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent + label_sep = ( + self.end_x_position - self.start_x_position + ) * self.label_sep_percent + segment_x = self.start_x_position + 0.5 * ( + self.end_x_position - self.start_x_position - segment_width + ) + + y_limits = [] + for sub_component in self._sub_components: + y_limits.extend( + (sub_component.start_y_position, sub_component.end_y_position) + ) + y_min = min(y_limits) + y_max = max(y_limits) + del y_limits + # Now do some label placement magic... 
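+        # Each label tuple is (y-position, colour, background colour, text);
+        # _place_labels (defined below) sorts them and uses _spring_layout
+        # to nudge the y positions apart so the text doesn't overlap.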
+ # from reportlab.pdfbase import pdfmetrics + # font = pdfmetrics.getFont('Helvetica') + # h = (font.face.ascent + font.face.descent) * 0.90 + h = self.label_size + for x1, x2, labels, anchor in [ + ( + segment_x, + segment_x - label_sep, + _place_labels(left_labels, y_min, y_max, h), + "end", + ), + ( + segment_x + segment_width, + segment_x + segment_width + label_sep, + _place_labels(right_labels, y_min, y_max, h), + "start", + ), + ]: + for (y1, y2, color, back_color, name) in labels: + cur_drawing.add( + Line(x1, y1, x2, y2, strokeColor=color, strokeWidth=0.25) + ) + label_string = String(x2, y2, name, textAnchor=anchor) + label_string.fontName = "Helvetica" + label_string.fontSize = h + if color_label: + label_string.fillColor = color + if back_color: + w = stringWidth(name, label_string.fontName, label_string.fontSize) + if x1 > x2: + w = w * -1.0 + cur_drawing.add( + Rect( + x2, + y2 - 0.1 * h, + w, + h, + strokeColor=back_color, + fillColor=back_color, + ) + ) + cur_drawing.add(label_string) + + +class ChromosomeSegment(_ChromosomeComponent): + """Draw a segment of a chromosome. + + This class provides the important configurable functionality of drawing + a Chromosome. Each segment has some customization available here, or can + be subclassed to define additional functionality. Most of the interesting + drawing stuff is likely to happen at the ChromosomeSegment level. + """ + + def __init__(self): + """Initialize a ChromosomeSegment. + + Attributes: + - start_x_position, end_x_position - Defines the x range we have + to draw things in. + - start_y_position, end_y_position - Defines the y range we have + to draw things in. + + Configuration Attributes: + - scale - A scaling value for the component. By default this is + set at 1 (ie -- has the same scale as everything else). Higher + values give more size to the component, smaller values give less. + - fill_color - A color to fill in the segment with. Colors are + available in reportlab.lib.colors + - label - A label to place on the chromosome segment. This should + be a text string specifying what is to be included in the label. + - label_size - The size of the label. + - chr_percent - The percentage of area that the chromosome + segment takes up. + + """ + _ChromosomeComponent.__init__(self) + + self.start_x_position = -1 + self.end_x_position = -1 + self.start_y_position = -1 + self.end_y_position = -1 + + # --- attributes for configuration + self.scale = 1 + self.fill_color = None + self.label = None + self.label_size = 6 + self.chr_percent = 0.25 + + def draw(self, cur_drawing): + """Draw a chromosome segment. + + Before drawing, the range we are drawing in needs to be set. + """ + for position in ( + self.start_x_position, + self.end_x_position, + self.start_y_position, + self.end_y_position, + ): + assert position != -1, "Need to set drawing coordinates." + + self._draw_subcomponents(cur_drawing) # Anything behind + self._draw_segment(cur_drawing) + self._overdraw_subcomponents(cur_drawing) # Anything on top + self._draw_label(cur_drawing) + + def _draw_subcomponents(self, cur_drawing): + """Draw any subcomponents of the chromosome segment (PRIVATE). + + This should be overridden in derived classes if there are + subcomponents to be drawn. + """ + pass + + def _draw_segment(self, cur_drawing): + """Draw the current chromosome segment (PRIVATE).""" + # set the coordinates of the segment -- it'll take up the MIDDLE part + # of the space we have. 
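+        # For example (hypothetical numbers), with chr_percent = 0.25 and an
+        # x range of 100..200, the segment is (200 - 100) * 0.25 = 25 units
+        # wide and starts at x = 100 + 0.5 * (100 - 25) = 137.5.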
+ segment_y = self.end_y_position + segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent + segment_height = self.start_y_position - self.end_y_position + segment_x = self.start_x_position + 0.5 * ( + self.end_x_position - self.start_x_position - segment_width + ) + + # first draw the sides of the segment + right_line = Line(segment_x, segment_y, segment_x, segment_y + segment_height) + left_line = Line( + segment_x + segment_width, + segment_y, + segment_x + segment_width, + segment_y + segment_height, + ) + + cur_drawing.add(right_line) + cur_drawing.add(left_line) + + # now draw the box, if it is filled in + if self.fill_color is not None: + fill_rectangle = Rect(segment_x, segment_y, segment_width, segment_height) + fill_rectangle.fillColor = self.fill_color + fill_rectangle.strokeColor = None + + cur_drawing.add(fill_rectangle) + + def _overdraw_subcomponents(self, cur_drawing): + """Draw any subcomponents of the chromosome segment over the main part (PRIVATE). + + This should be overridden in derived classes if there are + subcomponents to be drawn. + """ + pass + + def _draw_label(self, cur_drawing): + """Add a label to the chromosome segment (PRIVATE). + + The label will be applied to the right of the segment. + + This may be overlapped by any sub-feature labels on other segments! + """ + if self.label is not None: + + label_x = 0.5 * (self.start_x_position + self.end_x_position) + ( + self.chr_percent + 0.05 + ) * (self.end_x_position - self.start_x_position) + label_y = ( + self.start_y_position - self.end_y_position + ) / 2 + self.end_y_position + + label_string = String(label_x, label_y, self.label) + label_string.fontName = "Helvetica" + label_string.fontSize = self.label_size + + cur_drawing.add(label_string) + + +def _spring_layout(desired, minimum, maximum, gap=0): + """Try to layout label co-ordinates or other floats (PRIVATE). + + Originally written for the y-axis vertical positioning of labels on a + chromosome diagram (where the minimum gap between y-axis co-ordinates is + the label height), it could also potentially be used for x-axis placement, + or indeed radial placement for circular chromosomes within GenomeDiagram. + + In essence this is an optimisation problem, balancing the desire to have + each label as close as possible to its data point, but also to spread out + the labels to avoid overlaps. This could be described with a cost function + (modelling the label distance from the desired placement, and the inter- + label separations as springs) and solved as a multi-variable minimization + problem - perhaps with NumPy or SciPy. + + For now however, the implementation is a somewhat crude ad hoc algorithm. + + NOTE - This expects the input data to have been sorted! + """ + count = len(desired) + if count <= 1: + return desired # Easy! 
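+    # Worked example (hypothetical numbers): desired = [0.10, 0.12, 0.50]
+    # with minimum=0.0, maximum=1.0 and gap=0.1 -- the first two values sit
+    # closer together than the gap, so the code below must spread them apart.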
+    if minimum >= maximum:
+        raise ValueError("Bad min/max %f and %f" % (minimum, maximum))
+    if min(desired) < minimum or max(desired) > maximum:
+        raise ValueError(
+            "Data %f to %f out of bounds (%f to %f)"
+            % (min(desired), max(desired), minimum, maximum)
+        )
+    equal_step = float(maximum - minimum) / (count - 1)
+
+    if equal_step < gap:
+        import warnings
+        from Bio import BiopythonWarning
+
+        warnings.warn("Too many labels to avoid overlap", BiopythonWarning)
+        # Crudest solution
+        return [minimum + i * equal_step for i in range(count)]
+
+    good = True
+    if gap:
+        # Check each consecutive pair is at least gap apart
+        prev = desired[0]
+        for next in desired[1:]:
+            if next - prev < gap:
+                good = False
+                break
+            prev = next
+    if good:
+        return desired
+
+    span = maximum - minimum
+    for split in [0.5 * span, span / 3.0, 2 * span / 3.0, 0.25 * span, 0.75 * span]:
+        midpoint = minimum + split
+        low = [x for x in desired if x <= midpoint - 0.5 * gap]
+        high = [x for x in desired if x > midpoint + 0.5 * gap]
+        if len(low) + len(high) < count:
+            # Bad split point, points right on boundary
+            continue
+        elif not low and len(high) * gap <= (span - split) + 0.5 * gap:
+            # Give a little of the unused low space to the high points
+            return _spring_layout(high, midpoint + 0.5 * gap, maximum, gap)
+        elif not high and len(low) * gap <= split + 0.5 * gap:
+            # Give a little of the unused high space to the low points
+            return _spring_layout(low, minimum, midpoint - 0.5 * gap, gap)
+        elif (
+            len(low) * gap <= split - 0.5 * gap
+            and len(high) * gap <= (span - split) - 0.5 * gap
+        ):
+            return _spring_layout(
+                low, minimum, midpoint - 0.5 * gap, gap
+            ) + _spring_layout(high, midpoint + 0.5 * gap, maximum, gap)
+
+    # This can be counter-productive now we can split out into the telomere or
+    # spacer-segment's vertical space...
+    # Try not to spread out as far as the min/max unless needed
+    low = min(desired)
+    high = max(desired)
+    if (high - low) / (count - 1) >= gap:
+        # Good, we don't need the full range, and can position the
+        # min and max exactly as well :)
+        equal_step = (high - low) / (count - 1)
+        return [low + i * equal_step for i in range(count)]
+
+    low = 0.5 * (minimum + min(desired))
+    high = 0.5 * (max(desired) + maximum)
+    if (high - low) / (count - 1) >= gap:
+        # Good, we don't need the full range
+        equal_step = (high - low) / (count - 1)
+        return [low + i * equal_step for i in range(count)]
+
+    # Crudest solution
+    return [minimum + i * equal_step for i in range(count)]
+
+
+# assert False, _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1)
+# assert _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1) == \
+#        [0.0, 0.125, 0.25, 0.375, 0.5, 0.75, 1.0]
+# assert _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1) == \
+#        [0.0, 0.16666666666666666, 0.33333333333333331, 0.5,
+#         0.66666666666666663, 0.83333333333333326, 1.0]
+
+
+def _place_labels(desired_etc, minimum, maximum, gap=0):
+    # Want a list of lists/tuples for desired_etc
+    desired_etc.sort()
+    placed = _spring_layout([row[0] for row in desired_etc], minimum, maximum, gap)
+    for old, y2 in zip(desired_etc, placed):
+        # (y1, a, b, c, ..., z) --> (y1, y2, a, b, c, ..., z)
+        yield (old[0], y2) + tuple(old[1:])
+
+
+class AnnotatedChromosomeSegment(ChromosomeSegment):
+    """Annotated chromosome segment.
+
+    This is like the ChromosomeSegment, but accepts a list of features.
+    """
+
+    def __init__(
+        self,
+        bp_length,
+        features,
+        default_feature_color=colors.blue,
+        name_qualifiers=("gene", "label", "name", "locus_tag", "product"),
+    ):
+        """Initialize.
+
+        The features can either be SeqFeature objects, or tuples of values:
+        start (int), end (int), strand (+1, -1, 0 or None), label (string),
+        ReportLab color (string or object), and optional ReportLab fill color.
+
+        Note we require 0 <= start <= end <= bp_length, and within the vertical
+        space allocated to this segment, lines will be placed according to the
+        start/end coordinates (starting from the top).
+
+        Positive strand features are drawn on the right, negative on the left,
+        otherwise all the way across.
+
+        We recommend using consistent units for all the segment's scale values
+        (e.g. their length in base pairs).
+
+        When providing features as SeqFeature objects, the default color
+        is used, unless the feature's qualifiers include an Artemis colour
+        string (functionality also in GenomeDiagram). The caption also follows
+        the GenomeDiagram approach and takes the first qualifier from the list
+        or tuple specified in name_qualifiers.
+
+        Note the additional attribute label_sep_percent, which controls the
+        gap between the segment and its feature labels as a percentage of the
+        horizontal space, by default half of the chr_percent attribute (half
+        of 25%, thus 12.5%).
+
+        """
+        ChromosomeSegment.__init__(self)
+        self.bp_length = bp_length
+        self.features = features
+        self.default_feature_color = default_feature_color
+        self.name_qualifiers = name_qualifiers
+        self.label_sep_percent = self.chr_percent * 0.5
+
+    def _overdraw_subcomponents(self, cur_drawing):
+        """Draw any annotated features on the chromosome segment (PRIVATE).
+
+        Assumes _draw_segment has already been called to fill out the basic
+        shape, and assumes that it uses the same boundaries.
+        """
+        # set the coordinates of the segment -- it'll take up the MIDDLE part
+        # of the space we have.
+        segment_y = self.end_y_position
+        segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent
+        label_sep = (
+            self.end_x_position - self.start_x_position
+        ) * self.label_sep_percent
+        segment_height = self.start_y_position - self.end_y_position
+        segment_x = self.start_x_position + 0.5 * (
+            self.end_x_position - self.start_x_position - segment_width
+        )
+
+        left_labels = []
+        right_labels = []
+        for f in self.features:
+            try:
+                # Assume SeqFeature objects
+                start = f.location.start
+                end = f.location.end
+                strand = f.strand
+                try:
+                    # Handles Artemis colour integers, HTML colors, etc
+                    color = _color_trans.translate(f.qualifiers["color"][0])
+                except Exception:  # TODO: ValueError?
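+                    # No usable colour qualifier -- fall back to the default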
+                    color = self.default_feature_color
+                fill_color = color
+                name = ""
+                for qualifier in self.name_qualifiers:
+                    if qualifier in f.qualifiers:
+                        name = f.qualifiers[qualifier][0]
+                        break
+            except AttributeError:
+                # Assume tuple of ints, string, and color
+                start, end, strand, name, color = f[:5]
+                color = _color_trans.translate(color)
+                if len(f) > 5:
+                    fill_color = _color_trans.translate(f[5])
+                else:
+                    fill_color = color
+            assert 0 <= start <= end <= self.bp_length
+            if strand == +1:
+                # Right side only
+                x = segment_x + segment_width * 0.6
+                w = segment_width * 0.4
+            elif strand == -1:
+                # Left side only
+                x = segment_x
+                w = segment_width * 0.4
+            else:
+                # Both or neither - full width
+                x = segment_x
+                w = segment_width
+            local_scale = segment_height / self.bp_length
+            # Note the negative height draws the box downwards from the
+            # feature's start position
+            fill_rectangle = Rect(
+                x,
+                segment_y + segment_height - local_scale * start,
+                w,
+                local_scale * (start - end),
+            )
+            fill_rectangle.fillColor = fill_color
+            fill_rectangle.strokeColor = color
+            cur_drawing.add(fill_rectangle)
+            if name:
+                if fill_color == color:
+                    back_color = None
+                else:
+                    back_color = fill_color
+                value = (
+                    segment_y + segment_height - local_scale * start,
+                    color,
+                    back_color,
+                    name,
+                )
+                if strand == -1:
+                    self._left_labels.append(value)
+                else:
+                    self._right_labels.append(value)
+
+
+class TelomereSegment(ChromosomeSegment):
+    """A segment that is located at the end of a linear chromosome.
+
+    This is just like a regular segment, but it draws the end of a chromosome
+    which is represented by a half circle. This just overrides the
+    _draw_segment method of ChromosomeSegment to provide that specialized
+    drawing.
+    """
+
+    def __init__(self, inverted=0):
+        """Initialize a segment at the end of a chromosome.
+
+        See ChromosomeSegment for all of the attributes that can be
+        customized in a TelomereSegment.
+
+        Arguments:
+        - inverted -- Whether or not the telomere should be inverted
+          (i.e. drawn on the bottom of a chromosome)
+
+        """
+        ChromosomeSegment.__init__(self)
+
+        self._inverted = inverted
+
+    def _draw_segment(self, cur_drawing):
+        """Draw a half circle representing the end of a linear chromosome (PRIVATE)."""
+        # set the coordinates of the segment -- it'll take up the MIDDLE part
+        # of the space we have.
+        width = (self.end_x_position - self.start_x_position) * self.chr_percent
+        height = self.start_y_position - self.end_y_position
+        center_x = 0.5 * (self.end_x_position + self.start_x_position)
+        start_x = center_x - 0.5 * width
+        if self._inverted:
+            center_y = self.start_y_position
+            start_angle = 180
+            end_angle = 360
+        else:
+            center_y = self.end_y_position
+            start_angle = 0
+            end_angle = 180
+
+        cap_wedge = Wedge(center_x, center_y, width / 2, start_angle, end_angle, height)
+        cap_wedge.strokeColor = None
+        cap_wedge.fillColor = self.fill_color
+        cur_drawing.add(cap_wedge)
+
+        # Now draw an arc for the curved edge of the wedge,
+        # omitting the flat end.
+        cap_arc = ArcPath()
+        cap_arc.addArc(center_x, center_y, width / 2, start_angle, end_angle, height)
+        cur_drawing.add(cap_arc)
+
+
+class SpacerSegment(ChromosomeSegment):
+    """A blank spacer segment for a chromosome.
+
+    Doesn't draw anything, just empty space which can be helpful
+    for layout purposes (e.g. making room for feature labels).
+    """
+
+    def draw(self, cur_diagram):
+        """Draw nothing to the current diagram (dummy method).
+
+        The segment spacer has no actual image in the diagram, so this
+        method does nothing; it is defined only to match the expected API
+        of the other segment objects.
+        """
+        pass
diff --git a/code/lib/Bio/Graphics/ColorSpiral.py b/code/lib/Bio/Graphics/ColorSpiral.py
new file mode 100644
index 0000000..c113b7a
--- /dev/null
+++ b/code/lib/Bio/Graphics/ColorSpiral.py
@@ -0,0 +1,206 @@
+# Copyright 2012 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Generate RGB colours suitable for distinguishing categorical data.
+
+This module provides a class that implements a spiral 'path' through HSV
+colour space, permitting the selection of a number of points along that path,
+and returning the output in RGB colour space, suitable for use with ReportLab
+and other graphics packages.
+
+This approach to colour choice was inspired by Bang Wong's Points of View
+article: Color Coding, in Nature Methods _7_ 573 (https://doi.org/10.1038/nmeth0810-573).
+
+The module also provides helper functions that return a list of colours, or
+a dictionary of colours (if passed an iterable containing the names of
+categories to be coloured).
+"""
+
+# standard library
+import colorsys  # colour format conversions
+from math import log, exp, floor, pi
+import random  # for jitter values
+
+
+class ColorSpiral:
+    """Implement a spiral path through HSV colour space.
+
+    This class provides functions for sampling points along a logarithmic
+    spiral path through HSV colour space.
+
+    The spiral is described by r = a * exp(b * t) where r is the distance
+    from the axis of the HSV cylinder to the current point in the spiral,
+    and t is the angle through which the spiral has turned to reach the
+    current point. a and b are (positive, real) parameters that control the
+    shape of the spiral.
+
+    - a: the starting direction of the spiral
+    - b: the number of revolutions about the axis made by the spiral
+
+    We permit the spiral to move along the cylinder ('in V-space') between
+    v_init and v_final, to give a gradation in V (essentially, brightness),
+    along the path, where v_init, v_final are in [0,1].
+
+    A brightness 'jitter' may also be provided as an absolute value in
+    V-space, to aid in distinguishing consecutive colour points on the
+    path.
+    """
+
+    def __init__(self, a=1, b=0.33, v_init=0.85, v_final=0.5, jitter=0.05):
+        """Initialize a logarithmic spiral path through HSV colour space.
+
+        Arguments:
+        - a - Parameter a for the spiral, controls the initial spiral
+          direction. a > 0
+        - b - parameter b for the spiral, controls the rate at which the
+          spiral revolves around the axis. b > 0
+        - v_init - initial value of V (brightness) for the spiral.
+          v_init in [0,1]
+        - v_final - final value of V (brightness) for the spiral.
+          v_final in [0,1]
+        - jitter - the degree of V (brightness) jitter to add to each
+          selected colour. The amount of jitter will be selected
+          from a uniform random distribution [-jitter, jitter],
+          and V will be maintained in [0,1].
+
+        """
+        # Initialize attributes
+        self.a = a
+        self.b = b
+        self.v_init = v_init
+        self.v_final = v_final
+        self.jitter = jitter
+
+    def get_colors(self, k, offset=0.1):
+        """Generate k different RGB colours evenly spaced on the spiral.
+ + A generator returning the RGB colour space values for k + evenly-spaced points along the defined spiral in HSV space. + + Arguments: + - k - the number of points to return + - offset - how far along the spiral path to start. + + """ + # We use the offset to skip a number of similar colours near to HSV axis + assert offset > 0 and offset < 1, "offset must be in (0,1)" + v_rate = (self._v_final - self._v_init) / float(k) + # Generator for colours: we have divided the arc length into sections + # of equal length, and step along them + for n in range(1, k + 1): + # For each value of n, t indicates the angle through which the + # spiral has turned, to this point + t = (1.0 / self._b) * ( + log(n + (k * offset)) - log((1 + offset) * k * self._a) + ) + # Put 0 <= h <= 2*pi, where h is the angular part of the polar + # co-ordinates for this point on the spiral + h = t + while h < 0: + h += 2 * pi + h = h - (floor(h / (2 * pi)) * pi) + # Now put h in [0, 1] for colorsys conversion + h = h / (2 * pi) + # r is the radial distance of this point from the centre + r = self._a * exp(self._b * t) + # v is the brightness of this point, linearly interpolated + # from self._v_init to self._v_final. Jitter size is sampled from + # a uniform distribution + if self._jitter: + jitter = random.random() * 2 * self._jitter - self._jitter + else: + jitter = 0 + v = self._v_init + (n * v_rate + jitter) + # We have arranged the arithmetic such that 0 <= r <= 1, so + # we can use this value directly as s in HSV + yield colorsys.hsv_to_rgb(h, r, max(0, min(v, 1))) + + def _get_a(self): + return self._a + + def _set_a(self, value): + self._a = max(0, value) + + def _get_b(self): + return self._b + + def _set_b(self, value): + self._b = max(0, value) + + def _get_v_init(self): + return self._v_init + + def _set_v_init(self, value): + self._v_init = max(0, min(1, value)) + + def _get_v_final(self): + return self._v_final + + def _set_v_final(self, value): + self._v_final = max(0, min(1, value)) + + def _get_jitter(self): + return self._jitter + + def _set_jitter(self, value): + self._jitter = max(0, min(1, value)) + + a = property( + _get_a, _set_a, doc="Parameter controlling initial spiral direction (a > 0)" + ) + b = property( + _get_b, + _set_b, + doc="Parameter controlling rate spiral revolves around axis (b > 0)", + ) + v_init = property( + _get_v_init, + _set_v_init, + doc="Initial value of V (brightness) for the spiral (range 0 to 1)", + ) + v_final = property( + _get_v_final, + _set_v_final, + doc="Final value of V (brightness) for the spiral (range 0 to 1)", + ) + jitter = property( + _get_jitter, + _set_jitter, + doc="Degree of V (brightness) jitter to add to each color (range 0 to 1)", + ) + + +# Convenience functions for those who don't want to bother with a +# ColorSpiral object +def get_colors(k, **kwargs): + """Return k colours selected by the ColorSpiral object, as a generator. + + Arguments: + - k - the number of colours to return + - kwargs - pass-through arguments to the ColorSpiral object + + """ + cs = ColorSpiral(**kwargs) + return cs.get_colors(k) + + +def get_color_dict(l, **kwargs): + """Return a dictionary of colours using the provided values as keys. + + Returns a dictionary, keyed by the members of iterable l, with a + colour assigned to each member. 
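+
+    For example (exact RGB values will vary with the spiral settings)::
+
+        color_dict = get_color_dict(["cat1", "cat2", "cat3"])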
+
+    Arguments:
+    - l - an iterable representing classes to be coloured
+    - kwargs - pass-through arguments to the ColorSpiral object
+
+    """
+    cs = ColorSpiral(**kwargs)
+    colors = cs.get_colors(len(l))
+    color_dict = {}
+    for item in l:
+        color_dict[item] = next(colors)
+    return color_dict
diff --git a/code/lib/Bio/Graphics/Comparative.py b/code/lib/Bio/Graphics/Comparative.py
new file mode 100644
index 0000000..35bc192
--- /dev/null
+++ b/code/lib/Bio/Graphics/Comparative.py
@@ -0,0 +1,178 @@
+# Copyright 2001 by Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Plots to compare information between different sources.
+
+This file contains high level plots which are designed to be used to
+compare different types of information. The most basic example is comparing
+two variables in a traditional scatter plot.
+"""
+# reportlab
+from reportlab.lib import colors
+from reportlab.graphics.charts.lineplots import LinePlot
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.units import inch
+
+from reportlab.graphics.shapes import Drawing, String
+from reportlab.graphics.charts.markers import makeEmptySquare, makeFilledSquare
+from reportlab.graphics.charts.markers import makeFilledDiamond, makeSmiley
+from reportlab.graphics.charts.markers import makeFilledCircle, makeEmptyCircle
+
+from Bio.Graphics import _write
+
+
+class ComparativeScatterPlot:
+    """Display a scatter-type plot comparing two different kinds of info.
+
+    Attributes:
+    - display_info - a 2D list of the information we'll be outputting. Each
+      top level list is a different data type, and each data point is a
+      two-tuple of the coordinates of a point.
+
+    So if you had two distributions of points, it should look like::
+
+        display_info = [[(1, 2), (3, 4)],
+                        [(5, 6), (7, 8)]]
+
+    If everything is just one set of points, display_info can look like::
+
+        display_info = [[(1, 2), (3, 4), (5, 6)]]
+
+    """
+
+    def __init__(self, output_format="pdf"):
+        """Initialize the class."""
+        # customizable attributes
+        self.number_of_columns = 1
+        self.page_size = letter
+        self.title_size = 20
+
+        self.output_format = output_format
+
+        # the information we'll be writing
+        self.display_info = []
+
+        # initial colors and shapes used for drawing points
+        self.color_choices = [
+            colors.red,
+            colors.green,
+            colors.blue,
+            colors.yellow,
+            colors.orange,
+            colors.black,
+        ]
+        self.shape_choices = [
+            makeFilledCircle,
+            makeEmptySquare,
+            makeFilledDiamond,
+            makeFilledSquare,
+            makeEmptyCircle,
+            makeSmiley,
+        ]
+
+    def draw_to_file(self, output_file, title):
+        """Write the comparative plot to a file.
+
+        Arguments:
+        - output_file - The name of the file to output the information to,
+          or a handle to write to.
+        - title - A title to display on the graphic.
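+
+        A minimal usage sketch (file name hypothetical)::
+
+            plot = ComparativeScatterPlot()
+            plot.display_info = [[(1, 2), (3, 4), (5, 6)]]
+            plot.draw_to_file("scatter.pdf", "Example comparison")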
+ + """ + width, height = self.page_size + cur_drawing = Drawing(width, height) + + self._draw_title(cur_drawing, title, width, height) + + start_x = inch * 0.5 + end_x = width - inch * 0.5 + end_y = height - 1.5 * inch + start_y = 0.5 * inch + self._draw_scatter_plot(cur_drawing, start_x, start_y, end_x, end_y) + + return _write(cur_drawing, output_file, self.output_format) + + def _draw_title(self, cur_drawing, title, width, height): + """Add a title to the page we are outputting (PRIVATE).""" + title_string = String(width / 2, height - inch, title) + title_string.fontName = "Helvetica-Bold" + title_string.fontSize = self.title_size + title_string.textAnchor = "middle" + + cur_drawing.add(title_string) + + def _draw_scatter_plot(self, cur_drawing, x_start, y_start, x_end, y_end): + """Draw a scatter plot on the drawing with the given coordinates (PRIVATE).""" + scatter_plot = LinePlot() + + # set the dimensions of the scatter plot + scatter_plot.x = x_start + scatter_plot.y = y_start + scatter_plot.width = abs(x_start - x_end) + scatter_plot.height = abs(y_start - y_end) + + scatter_plot.data = self.display_info + + scatter_plot.joinedLines = 0 + + # set the axes of the plot + x_min, x_max, y_min, y_max = self._find_min_max(self.display_info) + scatter_plot.xValueAxis.valueMin = x_min + scatter_plot.xValueAxis.valueMax = x_max + scatter_plot.xValueAxis.valueStep = (x_max - x_min) / 10.0 + + scatter_plot.yValueAxis.valueMin = y_min + scatter_plot.yValueAxis.valueMax = y_max + scatter_plot.yValueAxis.valueStep = (y_max - y_min) / 10.0 + + self._set_colors_and_shapes(scatter_plot, self.display_info) + + cur_drawing.add(scatter_plot) + + def _set_colors_and_shapes(self, scatter_plot, display_info): + """Set the colors and shapes of the points displayed (PRIVATE). + + By default this just sets all of the points according to the order + of colors and shapes defined in self.color_choices and + self.shape_choices. The first 5 shapes and colors are unique, the + rest of them are just set to the same color and shape (since I + ran out of shapes!). + + You can change how this function works by either changing the + values of the color_choices and shape_choices attributes, or + by inheriting from this class and overriding this function. + """ + for value_num in range(len(display_info)): + # if we have unique colors, add them + if (value_num + 1) < len(self.color_choices): + scatter_plot.lines[value_num].strokeColor = self.color_choices[ + value_num + ] + scatter_plot.lines[value_num].symbol = self.shape_choices[value_num] + # otherwise just use the last number + else: + scatter_plot.lines[value_num].strokeColor = self.color_choices[-1] + scatter_plot.lines[value_num].symbol = self.shape_choices[-1] + + def _find_min_max(self, info): + """Find min and max for x and y coordinates in the given data (PRIVATE).""" + x_min = info[0][0][0] + x_max = info[0][0][0] + y_min = info[0][0][1] + y_max = info[0][0][1] + + for two_d_list in info: + for x, y in two_d_list: + if x > x_max: + x_max = x + if x < x_min: + x_min = x + if y > y_max: + y_max = y + if y < y_min: + y_min = y + + return x_min, x_max, y_min, y_max diff --git a/code/lib/Bio/Graphics/DisplayRepresentation.py b/code/lib/Bio/Graphics/DisplayRepresentation.py new file mode 100644 index 0000000..df75283 --- /dev/null +++ b/code/lib/Bio/Graphics/DisplayRepresentation.py @@ -0,0 +1,187 @@ +# Copyright 2001 by Brad Chapman. All rights reserved. 
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Represent information for graphical display.
+
+Classes in this module are designed to hold information in a way that
+makes it easy to draw graphical figures.
+"""
+# reportlab
+from reportlab.lib import colors
+
+# local stuff
+from Bio.Graphics.BasicChromosome import ChromosomeSegment
+from Bio.Graphics.BasicChromosome import TelomereSegment
+
+
+# --- constants
+# This is a default color scheme based on the light spectrum.
+# Based on my vague recollections from biology, this is our friend ROY G. BIV
+RAINBOW_COLORS = {
+    (1, 1): colors.violet,
+    (2, 2): colors.indigo,
+    (3, 3): colors.blue,
+    (4, 4): colors.green,
+    (5, 5): colors.yellow,
+    (6, 6): colors.orange,
+    (7, 20): colors.red,
+}
+
+
+class ChromosomeCounts:
+    """Represent a chromosome with count information.
+
+    This is used to display information about counts along a chromosome.
+    The segments are expected to have different count information, which
+    will be displayed using a color scheme.
+
+    I envision using this class when you think that certain regions of
+    the chromosome will be especially abundant in the counts, and you
+    want to pick those out.
+    """
+
+    def __init__(self, segment_names, color_scheme=RAINBOW_COLORS):
+        """Initialize a representation of chromosome counts.
+
+        Arguments:
+         - segment_names - An ordered list of all segment names along
+           the chromosome. The count and other information will be added
+           to these.
+         - color_scheme - A coloring scheme to use in the counts. This
+           should be a dictionary mapping count ranges to colors (specified
+           in reportlab.lib.colors).
+
+        """
+        self._names = segment_names
+        self._count_info = {}
+        self._label_info = {}
+        self._scale_info = {}
+        for name in self._names:
+            self._count_info[name] = 0
+            self._label_info[name] = None
+            self._scale_info[name] = 1
+
+        self._color_scheme = color_scheme
+
+    def add_count(self, segment_name, count=1):
+        """Add counts to the given segment name.
+
+        Arguments:
+         - segment_name - The name of the segment we should add counts to.
+           If the name is not present, a KeyError will be raised.
+         - count - The counts to add to the current segment. This defaults
+           to a single count.
+
+        """
+        try:
+            self._count_info[segment_name] += count
+        except KeyError:
+            raise KeyError("Segment name %s not found." % segment_name) from None
+
+    def scale_segment_value(self, segment_name, scale_value=None):
+        """Divide the counts for a segment by some kind of scale value.
+
+        This is useful if segments aren't represented by raw counts, but
+        are instead counts divided by some number.
+        """
+        try:
+            self._count_info[segment_name] = float(
+                self._count_info[segment_name]
+            ) / float(scale_value)
+        except KeyError:
+            raise KeyError("Segment name %s not found." % segment_name) from None
+
+    def add_label(self, segment_name, label):
+        """Add a label to a specific segment.
+
+        Raises a KeyError if the specified segment name is not found.
+        """
+        if segment_name in self._label_info:
+            self._label_info[segment_name] = label
+        else:
+            raise KeyError("Segment name %s not found." % segment_name)
+
+    def set_scale(self, segment_name, scale):
+        """Set the scale for a specific chromosome segment.
+
+        By default all segments have the same scale -- this allows scaling
+        by the size of the segment.
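+
+        For example, counts.set_scale("seg1", 2.0) would draw that segment
+        at twice the default scale (the segment name is illustrative).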
+
+        Raises a KeyError if the specified segment name is not found.
+        """
+        if segment_name in self._scale_info:
+            self._scale_info[segment_name] = scale
+        else:
+            raise KeyError("Segment name %s not found." % segment_name)
+
+    def get_segment_info(self):
+        """Retrieve the color and label info about the segments.
+
+        Returns a list consisting of two-tuples specifying the counts and
+        label name for each segment. The list is ordered according to the
+        original listing of names. Labels are set as None if no label
+        was specified.
+        """
+        order_info = []
+
+        for seg_name in self._names:
+            order_info.append((self._count_info[seg_name], self._label_info[seg_name]))
+
+        return order_info
+
+    def fill_chromosome(self, chromosome):
+        """Add the collected segment information to a chromosome for drawing.
+
+        Arguments:
+         - chromosome - A Chromosome graphics object that we can add
+           chromosome segments to.
+
+        This creates ChromosomeSegment (and TelomereSegment) objects to
+        fill in the chromosome. The information is derived from the
+        label and count information, with counts transformed to the
+        specified color map.
+
+        Returns the chromosome with all of the segments added.
+        """
+        for seg_num in range(len(self._names)):
+            is_end_segment = 0
+            # make the top and bottom telomeres
+            if seg_num == 0:
+                cur_segment = TelomereSegment()
+                is_end_segment = 1
+            elif seg_num == len(self._names) - 1:
+                cur_segment = TelomereSegment(1)
+                is_end_segment = 1
+            # otherwise, they are just regular segments
+            else:
+                cur_segment = ChromosomeSegment()
+
+            seg_name = self._names[seg_num]
+            if self._count_info[seg_name] > 0:
+                color = self._color_from_count(self._count_info[seg_name])
+                cur_segment.fill_color = color
+
+            if self._label_info[seg_name] is not None:
+                cur_segment.label = self._label_info[seg_name]
+
+            # give end segments extra size so they look right
+            if is_end_segment:
+                cur_segment.scale = 3
+            else:
+                cur_segment.scale = self._scale_info[seg_name]
+
+            chromosome.add(cur_segment)
+
+        return chromosome
+
+    def _color_from_count(self, count):
+        """Translate the given count into a color using the color scheme (PRIVATE)."""
+        for count_start, count_end in self._color_scheme:
+            if count >= count_start and count <= count_end:
+                return self._color_scheme[(count_start, count_end)]
+
+        # if we got here we didn't find a color for the count
+        raise ValueError("Count value %s was not found in the color scheme." % count)
diff --git a/code/lib/Bio/Graphics/Distribution.py b/code/lib/Bio/Graphics/Distribution.py
new file mode 100644
index 0000000..3bfb065
--- /dev/null
+++ b/code/lib/Bio/Graphics/Distribution.py
@@ -0,0 +1,258 @@
+# Copyright 2001 by Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Display information distributed across a Chromosome-like object.
+
+These classes are meant to show the distribution of some kind of information
+as it changes across any kind of segment. It was designed with chromosome
+distributions in mind, but could also work for chromosome regions, BAC clones
+or anything similar.
+
+Reportlab is used for producing the graphical output.
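+
+A minimal usage sketch (the file name and data are illustrative)::
+
+    page = DistributionPage()
+    page.distributions.append(BarChartDistribution([[1, 2, 3], [4, 5, 6]]))
+    page.draw("distribution.pdf", "Sample distribution")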
+""" +# standard library +import math + +# reportlab +from reportlab.lib.pagesizes import letter +from reportlab.lib.units import inch +from reportlab.lib import colors + +from reportlab.graphics.shapes import Drawing, String +from reportlab.graphics.charts.barcharts import VerticalBarChart +from reportlab.graphics.charts.barcharts import BarChartProperties +from reportlab.graphics.widgetbase import TypedPropertyCollection + +from Bio.Graphics import _write + + +class DistributionPage: + """Display a grouping of distributions on a page. + + This organizes Distributions, and will display them nicely + on a single page. + """ + + def __init__(self, output_format="pdf"): + """Initialize the class.""" + self.distributions = [] + + # customizable attributes + self.number_of_columns = 1 + self.page_size = letter + self.title_size = 20 + + self.output_format = output_format + + def draw(self, output_file, title): + """Draw out the distribution information. + + Arguments: + - output_file - The name of the file to output the information to, + or a handle to write to. + - title - A title to display on the graphic. + + """ + width, height = self.page_size + cur_drawing = Drawing(width, height) + + self._draw_title(cur_drawing, title, width, height) + + # calculate the x and y position changes for each distribution + cur_x_pos = inch * 0.5 + end_x_pos = width - inch * 0.5 + cur_y_pos = height - 1.5 * inch + end_y_pos = 0.5 * inch + x_pos_change = (end_x_pos - cur_x_pos) / float(self.number_of_columns) + num_y_rows = math.ceil( + float(len(self.distributions)) / float(self.number_of_columns) + ) + y_pos_change = (cur_y_pos - end_y_pos) / num_y_rows + + self._draw_distributions( + cur_drawing, cur_x_pos, x_pos_change, cur_y_pos, y_pos_change, num_y_rows + ) + self._draw_legend(cur_drawing, 2.5 * inch, width) + + return _write(cur_drawing, output_file, self.output_format) + + def _draw_title(self, cur_drawing, title, width, height): + """Add the title of the figure to the drawing (PRIVATE).""" + title_string = String(width / 2, height - inch, title) + title_string.fontName = "Helvetica-Bold" + title_string.fontSize = self.title_size + title_string.textAnchor = "middle" + + cur_drawing.add(title_string) + + def _draw_distributions( + self, + cur_drawing, + start_x_pos, + x_pos_change, + start_y_pos, + y_pos_change, + num_y_drawings, + ): + """Draw all of the distributions on the page (PRIVATE). + + Arguments: + - cur_drawing - The drawing we are working with. + - start_x_pos - The x position on the page to start drawing at. + - x_pos_change - The change in x position between each figure. + - start_y_pos - The y position on the page to start drawing at. + - y_pos_change - The change in y position between each figure. + - num_y_drawings - The number of drawings we'll have in the y + (up/down) direction. 
+ + """ + for y_drawing in range(int(num_y_drawings)): + # if we are on the last y position, we may not be able + # to fill all of the x columns + if (y_drawing + 1) * self.number_of_columns > len(self.distributions): + num_x_drawings = ( + len(self.distributions) - y_drawing * self.number_of_columns + ) + else: + num_x_drawings = self.number_of_columns + for x_drawing in range(num_x_drawings): + dist_num = y_drawing * self.number_of_columns + x_drawing + cur_distribution = self.distributions[dist_num] + + # find the x and y boundaries of the distribution + x_pos = start_x_pos + x_drawing * x_pos_change + end_x_pos = x_pos + x_pos_change + end_y_pos = start_y_pos - y_drawing * y_pos_change + y_pos = end_y_pos - y_pos_change + + # draw the distribution + cur_distribution.draw(cur_drawing, x_pos, y_pos, end_x_pos, end_y_pos) + + def _draw_legend(self, cur_drawing, start_y, width): + """Add a legend to the figure (PRIVATE). + + Subclasses can implement to provide a specialized legend. + """ + pass + + +class BarChartDistribution: + """Display the distribution of values as a bunch of bars.""" + + def __init__(self, display_info=None): + """Initialize a Bar Chart display of distribution info. + + Attributes: + - display_info - the information to be displayed in the distribution. + This should be ordered as a list of lists, where each internal list + is a data set to display in the bar chart. + + """ + if display_info is None: + display_info = [] + self.display_info = display_info + + self.x_axis_title = "" + self.y_axis_title = "" + self.chart_title = "" + self.chart_title_size = 10 + + self.padding_percent = 0.15 + + def draw(self, cur_drawing, start_x, start_y, end_x, end_y): + """Draw a bar chart with the info in the specified range.""" + bar_chart = VerticalBarChart() + if self.chart_title: + self._draw_title( + cur_drawing, self.chart_title, start_x, start_y, end_x, end_y + ) + # set the position of the bar chart + x_start, x_end, y_start, y_end = self._determine_position( + start_x, start_y, end_x, end_y + ) + + bar_chart.x = x_start + bar_chart.y = y_start + bar_chart.width = abs(x_start - x_end) + bar_chart.height = abs(y_start - y_end) + + # set the information in the bar chart + bar_chart.data = self.display_info + bar_chart.valueAxis.valueMin = min(self.display_info[0]) + bar_chart.valueAxis.valueMax = max(self.display_info[0]) + for data_set in self.display_info[1:]: + if min(data_set) < bar_chart.valueAxis.valueMin: + bar_chart.valueAxis.valueMin = min(data_set) + if max(data_set) > bar_chart.valueAxis.valueMax: + bar_chart.valueAxis.valueMax = max(data_set) + + # set other formatting options + if len(self.display_info) == 1: + bar_chart.groupSpacing = 0 + style = TypedPropertyCollection(BarChartProperties) + style.strokeWidth = 0 + style.strokeColor = colors.green + style[0].fillColor = colors.green + + bar_chart.bars = style + + # set the labels + # XXX labels don't work yet + # bar_chart.valueAxis.title = self.x_axis_title + # bar_chart.categoryAxis.title = self.y_axis_title + + cur_drawing.add(bar_chart) + + def _draw_title(self, cur_drawing, title, start_x, start_y, end_x, end_y): + """Add the title of the figure to the drawing (PRIVATE).""" + x_center = start_x + (end_x - start_x) / 2 + y_pos = end_y + (self.padding_percent * (start_y - end_y)) / 2 + title_string = String(x_center, y_pos, title) + title_string.fontName = "Helvetica-Bold" + title_string.fontSize = self.chart_title_size + title_string.textAnchor = "middle" + + cur_drawing.add(title_string) + + def 
_determine_position(self, start_x, start_y, end_x, end_y): + """Calculate the position of the chart with blank space (PRIVATE). + + This uses some padding around the chart, and takes into account + whether the chart has a title. It returns 4 values, which are, + in order, the x_start, x_end, y_start and y_end of the chart + itself. + """ + x_padding = self.padding_percent * (end_x - start_x) + y_padding = self.padding_percent * (start_y - end_y) + + new_x_start = start_x + x_padding + new_x_end = end_x - x_padding + + if self.chart_title: + new_y_start = start_y - y_padding - self.chart_title_size + else: + new_y_start = start_y - y_padding + + new_y_end = end_y + y_padding + + return new_x_start, new_x_end, new_y_start, new_y_end + + +class LineDistribution: + """Display the distribution of values as connected lines. + + This distribution displays the change in values across the object as + lines. This also allows multiple distributions to be displayed on a + single graph. + """ + + def __init__(self): + """Initialize the class.""" + pass + + def draw(self, cur_drawing, start_x, start_y, end_x, end_y): + """Draw a line distribution into the current drawing.""" + pass diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py new file mode 100644 index 0000000..4e97e36 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py @@ -0,0 +1,565 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2008-2017 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""AbstractDrawer module (considered to be a private module, the API may change!). + +Provides: + - AbstractDrawer - Superclass for methods common to the Drawer objects + - page_sizes - Method that returns a ReportLab pagesize when passed + a valid ISO size + - draw_box - Method that returns a closed path object when passed + the proper co-ordinates. For HORIZONTAL boxes only. + - angle2trig - Method that returns a tuple of values that are the + vector for rotating a point through a passed angle, + about an origin + - intermediate_points - Method that returns a list of values intermediate + between the points in a passed dataset + +For drawing capabilities, this module uses reportlab to draw and write +the diagram: http://www.reportlab.com + +For dealing with biological information, the package expects Biopython objects +like SeqFeatures. +""" + +# ReportLab imports + +from reportlab.lib import pagesizes +from reportlab.lib import colors +from reportlab.graphics.shapes import Polygon + +from math import pi, sin, cos +from itertools import islice + +################################################################################ +# METHODS +################################################################################ + + +# Utility method to translate strings to ISO page sizes +def page_sizes(size): + """Convert size string into a Reportlab pagesize. 
+ + Arguments: + - size - A string representing a standard page size, eg 'A4' or 'LETTER' + + """ + sizes = { # ReportLab pagesizes, keyed by ISO string + "A0": pagesizes.A0, + "A1": pagesizes.A1, + "A2": pagesizes.A2, + "A3": pagesizes.A3, + "A4": pagesizes.A4, + "A5": pagesizes.A5, + "A6": pagesizes.A6, + "B0": pagesizes.B0, + "B1": pagesizes.B1, + "B2": pagesizes.B2, + "B3": pagesizes.B3, + "B4": pagesizes.B4, + "B5": pagesizes.B5, + "B6": pagesizes.B6, + "ELEVENSEVENTEEN": pagesizes.ELEVENSEVENTEEN, + "LEGAL": pagesizes.LEGAL, + "LETTER": pagesizes.LETTER, + } + try: + return sizes[size] + except KeyError: + raise ValueError("%s not in list of page sizes" % size) from None + + +def _stroke_and_fill_colors(color, border): + """Deal with border and fill colors (PRIVATE).""" + if not isinstance(color, colors.Color): + raise ValueError("Invalid color %r" % color) + + if color == colors.white and border is None: + # Force black border on white boxes with undefined border + strokecolor = colors.black + elif border is None: + strokecolor = color # use fill color + elif border: + if not isinstance(border, colors.Color): + raise ValueError("Invalid border color %r" % border) + strokecolor = border + else: + # e.g. False + strokecolor = None + + return strokecolor, color + + +def draw_box( + point1, point2, color=colors.lightgreen, border=None, colour=None, **kwargs +): + """Draw a box. + + Arguments: + - point1, point2 - coordinates for opposite corners of the box + (x,y tuples) + - color /colour - The color for the box (colour takes priority + over color) + - border - Border color for the box + + Returns a closed path object, beginning at (x1,y1) going round + the four points in order, and filling with the passed color. + """ + x1, y1 = point1 + x2, y2 = point2 + + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + del colour + + strokecolor, color = _stroke_and_fill_colors(color, border) + + x1, y1, x2, y2 = min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2) + return Polygon( + [x1, y1, x2, y1, x2, y2, x1, y2], + strokeColor=strokecolor, + fillColor=color, + strokewidth=0, + **kwargs + ) + + +def draw_cut_corner_box( + point1, point2, corner=0.5, color=colors.lightgreen, border=None, **kwargs +): + """Draw a box with the corners cut off.""" + x1, y1 = point1 + x2, y2 = point2 + + if not corner: + return draw_box(point1, point2, color, border) + elif corner < 0: + raise ValueError("Arrow head length ratio should be positive") + + strokecolor, color = _stroke_and_fill_colors(color, border) + + boxheight = y2 - y1 + boxwidth = x2 - x1 + x_corner = min(boxheight * 0.5 * corner, boxwidth * 0.5) + y_corner = min(boxheight * 0.5 * corner, boxheight * 0.5) + + points = [ + x1, + y1 + y_corner, + x1, + y2 - y_corner, + x1 + x_corner, + y2, + x2 - x_corner, + y2, + x2, + y2 - y_corner, + x2, + y1 + y_corner, + x2 - x_corner, + y1, + x1 + x_corner, + y1, + ] + return Polygon( + deduplicate(points), + strokeColor=strokecolor, + strokeWidth=1, + strokeLineJoin=1, # 1=round + fillColor=color, + **kwargs + ) + + +def draw_polygon( + list_of_points, color=colors.lightgreen, border=None, colour=None, **kwargs +): + """Draw polygon. + + Arguments: + - list_of_point - list of (x,y) tuples for the corner coordinates + - color / colour - The color for the box + + Returns a closed path object, beginning at (x1,y1) going round + the four points in order, and filling with the passed colour. 
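+
+    A minimal sketch (the coordinates are illustrative)::
+
+        triangle = draw_polygon([(10, 10), (50, 90), (90, 10)], colors.lightblue)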
+ + """ + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + del colour + + strokecolor, color = _stroke_and_fill_colors(color, border) + + xy_list = [] + for (x, y) in list_of_points: + xy_list.append(x) + xy_list.append(y) + + return Polygon( + deduplicate(xy_list), + strokeColor=strokecolor, + fillColor=color, + strokewidth=0, + **kwargs + ) + + +def draw_arrow( + point1, + point2, + color=colors.lightgreen, + border=None, + shaft_height_ratio=0.4, + head_length_ratio=0.5, + orientation="right", + colour=None, + **kwargs +): + """Draw an arrow. + + Returns a closed path object representing an arrow enclosed by the + box with corners at {point1=(x1,y1), point2=(x2,y2)}, a shaft height + given by shaft_height_ratio (relative to box height), a head length + given by head_length_ratio (also relative to box height), and + an orientation that may be 'left' or 'right'. + """ + x1, y1 = point1 + x2, y2 = point2 + + if shaft_height_ratio < 0 or 1 < shaft_height_ratio: + raise ValueError("Arrow shaft height ratio should be in range 0 to 1") + if head_length_ratio < 0: + raise ValueError("Arrow head length ratio should be positive") + + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + del colour + + strokecolor, color = _stroke_and_fill_colors(color, border) + + # Depending on the orientation, we define the bottom left (x1, y1) and + # top right (x2, y2) coordinates differently, but still draw the box + # using the same relative co-ordinates: + xmin, ymin = min(x1, x2), min(y1, y2) + xmax, ymax = max(x1, x2), max(y1, y2) + if orientation == "right": + x1, x2, y1, y2 = xmin, xmax, ymin, ymax + elif orientation == "left": + x1, x2, y1, y2 = xmax, xmin, ymin, ymax + else: + raise ValueError( + "Invalid orientation %r, should be 'left' or 'right'" % orientation + ) + + # We define boxheight and boxwidth accordingly, and calculate the shaft + # height from these. We also ensure that the maximum head length is + # the width of the box enclosure + boxheight = y2 - y1 + boxwidth = x2 - x1 + shaftheight = boxheight * shaft_height_ratio + headlength = min(abs(boxheight) * head_length_ratio, abs(boxwidth)) + if boxwidth < 0: + headlength *= -1 # reverse it + + shafttop = 0.5 * (boxheight + shaftheight) + shaftbase = boxheight - shafttop + headbase = boxwidth - headlength + midheight = 0.5 * boxheight + + points = [ + x1, + y1 + shafttop, + x1 + headbase, + y1 + shafttop, + x1 + headbase, + y2, + x2, + y1 + midheight, + x1 + headbase, + y1, + x1 + headbase, + y1 + shaftbase, + x1, + y1 + shaftbase, + ] + + return Polygon( + deduplicate(points), + strokeColor=strokecolor, + # strokeWidth=max(1, int(boxheight/40.)), + strokeWidth=1, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + fillColor=color, + **kwargs + ) + + +def deduplicate(points): + """Remove adjacent duplicate points. + + This is important for use with the Polygon class since reportlab has a + bug with duplicate points. + + Arguments: + - points - list of points [x1, y1, x2, y2,...] 
+ + Returns a list in the same format with consecutive duplicates removed + """ + assert len(points) % 2 == 0 + if len(points) < 2: + return points + newpoints = points[0:2] + for x, y in zip(islice(points, 2, None, 2), islice(points, 3, None, 2)): + if x != newpoints[-2] or y != newpoints[-1]: + newpoints.append(x) + newpoints.append(y) + return newpoints + + +def angle2trig(theta): + """Convert angle to a reportlab ready tuple. + + Arguments: + - theta - Angle in degrees, counter clockwise from horizontal + + Returns a representation of the passed angle in a format suitable + for ReportLab rotations (i.e. cos(theta), sin(theta), -sin(theta), + cos(theta) tuple) + """ + c = cos(theta * pi / 180) + s = sin(theta * pi / 180) + return (c, s, -s, c) # Vector for rotating point around an origin + + +def intermediate_points(start, end, graph_data): + """Generate intermediate points describing provided graph data.. + + Returns a list of (start, end, value) tuples describing the passed + graph data as 'bins' between position midpoints. + """ + newdata = [] # data in form (X0, X1, val) + # add first block + newdata.append( + ( + start, + graph_data[0][0] + (graph_data[1][0] - graph_data[0][0]) / 2.0, + graph_data[0][1], + ) + ) + # add middle set + for index in range(1, len(graph_data) - 1): + lastxval, lastyval = graph_data[index - 1] + xval, yval = graph_data[index] + nextxval, nextyval = graph_data[index + 1] + newdata.append( + (lastxval + (xval - lastxval) / 2.0, xval + (nextxval - xval) / 2.0, yval) + ) + # add last block + newdata.append((xval + (nextxval - xval) / 2.0, end, graph_data[-1][1])) + return newdata + + +################################################################################ +# CLASSES +################################################################################ + + +class AbstractDrawer: + """Abstract Drawer. + + Attributes: + - tracklines Boolean for whether to draw lines delineating tracks + - pagesize Tuple describing the size of the page in pixels + - x0 Float X co-ord for leftmost point of drawable area + - xlim Float X co-ord for rightmost point of drawable area + - y0 Float Y co-ord for lowest point of drawable area + - ylim Float Y co-ord for topmost point of drawable area + - pagewidth Float pixel width of drawable area + - pageheight Float pixel height of drawable area + - xcenter Float X co-ord of center of drawable area + - ycenter Float Y co-ord of center of drawable area + - start Int, base to start drawing from + - end Int, base to stop drawing at + - length Size of sequence to be drawn + - cross_track_links List of tuples each with four entries (track A, + feature A, track B, feature B) to be linked. + + """ + + def __init__( + self, + parent, + pagesize="A3", + orientation="landscape", + x=0.05, + y=0.05, + xl=None, + xr=None, + yt=None, + yb=None, + start=None, + end=None, + tracklines=0, + cross_track_links=None, + ): + """Create the object. 
+
+        Arguments:
+         - parent Diagram object containing the data that the drawer draws
+         - pagesize String describing the ISO size of the image, or a tuple
+           of pixels
+         - orientation String describing the required orientation of the
+           final drawing ('landscape' or 'portrait')
+         - x Float (0->1) describing the relative size of the X
+           margins to the page
+         - y Float (0->1) describing the relative size of the Y
+           margins to the page
+         - xl Float (0->1) describing the relative size of the left X
+           margin to the page (overrides x)
+         - xr Float (0->1) describing the relative size of the right X
+           margin to the page (overrides x)
+         - yt Float (0->1) describing the relative size of the top Y
+           margin to the page (overrides y)
+         - yb Float (0->1) describing the relative size of the lower Y
+           margin to the page (overrides y)
+         - start Int, the position to begin drawing the diagram at
+         - end Int, the position to stop drawing the diagram at
+         - tracklines Boolean flag to show (or not) lines delineating tracks
+           on the diagram
+         - cross_track_links List of tuples each with four entries (track A,
+           feature A, track B, feature B) to be linked.
+
+        """
+        self._parent = parent  # The calling Diagram object
+
+        # Perform 'administrative' tasks of setting up the page
+        self.set_page_size(pagesize, orientation)  # Set drawing size
+        self.set_margins(x, y, xl, xr, yt, yb)  # Set page margins
+        self.set_bounds(start, end)  # Set limits on what will be drawn
+        self.tracklines = tracklines  # Set flags
+        if cross_track_links is None:
+            cross_track_links = []
+        self.cross_track_links = cross_track_links
+
+    def set_page_size(self, pagesize, orientation):
+        """Set page size of the drawing.
+
+        Arguments:
+         - pagesize Size of the output image, a tuple of pixels (width,
+           height), or a string in the reportlab.lib.pagesizes
+           set of ISO sizes.
+         - orientation String: 'landscape' or 'portrait'
+
+        """
+        if isinstance(pagesize, str):  # A string, so translate
+            pagesize = page_sizes(pagesize)
+        elif isinstance(pagesize, tuple):  # A tuple, so don't translate
+            pass
+        else:
+            raise ValueError("Page size %s not recognised" % pagesize)
+        shortside, longside = min(pagesize), max(pagesize)
+
+        orientation = orientation.lower()
+        if orientation not in ("landscape", "portrait"):
+            raise ValueError("Orientation %s not recognised" % orientation)
+        if orientation == "landscape":
+            self.pagesize = (longside, shortside)
+        else:
+            self.pagesize = (shortside, longside)
+
+    def set_margins(self, x, y, xl, xr, yt, yb):
+        """Set page margins.
+ + Arguments: + - x Float(0->1), Absolute X margin as % of page + - y Float(0->1), Absolute Y margin as % of page + - xl Float(0->1), Left X margin as % of page + - xr Float(0->1), Right X margin as % of page + - yt Float(0->1), Top Y margin as % of page + - yb Float(0->1), Bottom Y margin as % of page + + Set the page margins as proportions of the page 0->1, and also + set the page limits x0, y0 and xlim, ylim, and page center + xorigin, yorigin, as well as overall page width and height + """ + # Set left, right, top and bottom margins + xmargin_l = xl or x + xmargin_r = xr or x + ymargin_top = yt or y + ymargin_btm = yb or y + + # Set page limits, center and height/width + self.x0, self.y0 = self.pagesize[0] * xmargin_l, self.pagesize[1] * ymargin_btm + self.xlim, self.ylim = ( + self.pagesize[0] * (1 - xmargin_r), + self.pagesize[1] * (1 - ymargin_top), + ) + self.pagewidth = self.xlim - self.x0 + self.pageheight = self.ylim - self.y0 + self.xcenter, self.ycenter = ( + self.x0 + self.pagewidth / 2.0, + self.y0 + self.pageheight / 2.0, + ) + + def set_bounds(self, start, end): + """Set start and end points for the drawing as a whole. + + Arguments: + - start - The first base (or feature mark) to draw from + - end - The last base (or feature mark) to draw to + + """ + low, high = self._parent.range() # Extent of tracks + + if start is not None and end is not None and start > end: + start, end = end, start + + if start is None or start < 0: # Check validity of passed args and + start = 0 # default to 0 + if end is None or end < 0: + end = high + 1 # default to track range top limit + + self.start, self.end = int(start), int(end) + self.length = self.end - self.start + 1 + + def is_in_bounds(self, value): + """Check if given value is within the region selected for drawing. + + Arguments: + - value - A base position + + """ + if value >= self.start and value <= self.end: + return 1 + return 0 + + def __len__(self): + """Return the length of the region to be drawn.""" + return self.length + + def _current_track_start_end(self): + track = self._parent[self.current_track_level] + if track.start is None: + start = self.start + else: + start = max(self.start, track.start) + if track.end is None: + end = self.end + else: + end = min(self.end, track.end) + return start, end diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py new file mode 100644 index 0000000..b090fd9 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py @@ -0,0 +1,1725 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2008-2017 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""CircularDrawer module for GenomeDiagram."""
+
+# ReportLab imports
+
+from reportlab.graphics.shapes import Drawing, String, Group, Line, Circle, Polygon
+from reportlab.lib import colors
+from reportlab.graphics.shapes import ArcPath
+
+# GenomeDiagram imports
+from ._AbstractDrawer import AbstractDrawer, draw_polygon, intermediate_points
+from ._AbstractDrawer import _stroke_and_fill_colors
+from ._FeatureSet import FeatureSet
+from ._GraphSet import GraphSet
+
+from math import pi, cos, sin
+
+
+class CircularDrawer(AbstractDrawer):
+    """Object for drawing circular diagrams.
+
+    Attributes:
+     - tracklines Boolean for whether to draw lines delineating tracks
+     - pagesize Tuple describing the size of the page in pixels
+     - x0 Float X co-ord for leftmost point of drawable area
+     - xlim Float X co-ord for rightmost point of drawable area
+     - y0 Float Y co-ord for lowest point of drawable area
+     - ylim Float Y co-ord for topmost point of drawable area
+     - pagewidth Float pixel width of drawable area
+     - pageheight Float pixel height of drawable area
+     - xcenter Float X co-ord of center of drawable area
+     - ycenter Float Y co-ord of center of drawable area
+     - start Int, base to start drawing from
+     - end Int, base to stop drawing at
+     - length Size of sequence to be drawn
+     - track_size Float (0->1) the proportion of the track height to draw in
+     - drawing Drawing canvas
+     - drawn_tracks List of ints denoting which tracks are to be drawn
+     - current_track_level Int denoting which track is currently being drawn
+     - track_offsets Dictionary of number of pixels that each track top,
+       center and bottom is offset from the base of a fragment, keyed by track
+     - sweep Float (0->1) the proportion of the circle circumference to
+       use for the diagram
+     - cross_track_links List of tuples each with four entries (track A,
+       feature A, track B, feature B) to be linked.
+
+    """
+
+    def __init__(
+        self,
+        parent=None,
+        pagesize="A3",
+        orientation="landscape",
+        x=0.05,
+        y=0.05,
+        xl=None,
+        xr=None,
+        yt=None,
+        yb=None,
+        start=None,
+        end=None,
+        tracklines=0,
+        track_size=0.75,
+        circular=1,
+        circle_core=0.0,
+        cross_track_links=None,
+    ):
+        """Create CircularDrawer object.
+
+        Arguments:
+         - parent Diagram object containing the data that the drawer
+           draws
+         - pagesize String describing the ISO size of the image, or a tuple
+           of pixels
+         - orientation String describing the required orientation of the
+           final drawing ('landscape' or 'portrait')
+         - x Float (0->1) describing the relative size of the X
+           margins to the page
+         - y Float (0->1) describing the relative size of the Y
+           margins to the page
+         - xl Float (0->1) describing the relative size of the left X
+           margin to the page (overrides x)
+         - xr Float (0->1) describing the relative size of the right X
+           margin to the page (overrides x)
+         - yt Float (0->1) describing the relative size of the top Y
+           margin to the page (overrides y)
+         - yb Float (0->1) describing the relative size of the lower Y
+           margin to the page (overrides y)
+         - start Int, the position to begin drawing the diagram at
+         - end Int, the position to stop drawing the diagram at
+         - tracklines Boolean flag to show (or not) lines delineating tracks
+           on the diagram
+         - track_size The proportion of the available track height that
+           should be taken up in drawing
+         - circular Boolean flag to show whether the passed sequence is
+           circular or not
+         - circle_core The proportion of the available radius to leave
+           empty at the center of a circular diagram (0 to 1).
+         - cross_track_links List of tuples each with four entries (track A,
+           feature A, track B, feature B) to be linked.
+
+        """
+        # Use the superclass' instantiation method
+        AbstractDrawer.__init__(
+            self,
+            parent,
+            pagesize,
+            orientation,
+            x,
+            y,
+            xl,
+            xr,
+            yt,
+            yb,
+            start,
+            end,
+            tracklines,
+            cross_track_links,
+        )
+
+        # Useful measurements on the page
+        self.track_size = track_size
+        self.circle_core = circle_core
+        # Determine proportion of circumference around which information will be drawn
+        if not circular:
+            self.sweep = 0.9
+        else:
+            self.sweep = 1.0
+
+    def set_track_heights(self):
+        """Initialize track heights.
+
+        Since tracks may not be of identical heights, the bottom and top
+        radius for each track is stored in a dictionary - self.track_radii,
+        keyed by track number.
+        """
+        bot_track = min(min(self.drawn_tracks), 1)
+        top_track = max(self.drawn_tracks)  # The 'highest' track to draw
+
+        trackunit_sum = 0  # Total number of 'units' taken up by all tracks
+        trackunits = {}  # Start and end units for each track, keyed by track number
+        heightholder = 0  # placeholder variable
+        for track in range(bot_track, top_track + 1):  # track numbers to 'draw'
+            try:
+                trackheight = self._parent[track].height  # Get track height
+            except Exception:  # TODO: ValueError? IndexError?
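+                # Default to one height unit when the track level does not
+                # provide a usable height attribute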
+ trackheight = 1 + trackunit_sum += trackheight # increment total track unit height + trackunits[track] = (heightholder, heightholder + trackheight) + heightholder += trackheight # move to next height + + max_radius = 0.5 * min(self.pagewidth, self.pageheight) + trackunit_height = max_radius * (1 - self.circle_core) / trackunit_sum + track_core = max_radius * self.circle_core + + # Calculate top and bottom radii for each track + self.track_radii = {} # The inner, outer and center radii for each track + track_crop = ( + trackunit_height * (1 - self.track_size) / 2.0 + ) # 'step back' in pixels + for track in trackunits: + top = trackunits[track][1] * trackunit_height - track_crop + track_core + btm = trackunits[track][0] * trackunit_height + track_crop + track_core + ctr = btm + (top - btm) / 2.0 + self.track_radii[track] = (btm, ctr, top) + + def draw(self): + """Draw a circular diagram of the stored data.""" + # Instantiate the drawing canvas + self.drawing = Drawing(self.pagesize[0], self.pagesize[1]) + + feature_elements = [] # holds feature elements + feature_labels = [] # holds feature labels + greytrack_bgs = [] # holds track background + greytrack_labels = [] # holds track foreground labels + scale_axes = [] # holds scale axes + scale_labels = [] # holds scale axis labels + + # Get tracks to be drawn and set track sizes + self.drawn_tracks = self._parent.get_drawn_levels() + self.set_track_heights() + + # Go through each track in the parent (if it is to be drawn) one by + # one and collate the data as drawing elements + for track_level in self._parent.get_drawn_levels(): + self.current_track_level = track_level + track = self._parent[track_level] + gbgs, glabels = self.draw_greytrack(track) # Greytracks + greytrack_bgs.append(gbgs) + greytrack_labels.append(glabels) + features, flabels = self.draw_track(track) # Features and graphs + feature_elements.append(features) + feature_labels.append(flabels) + if track.scale: + axes, slabels = self.draw_scale(track) # Scale axes + scale_axes.append(axes) + scale_labels.append(slabels) + + feature_cross_links = [] + for cross_link_obj in self.cross_track_links: + cross_link_elements = self.draw_cross_link(cross_link_obj) + if cross_link_elements: + feature_cross_links.append(cross_link_elements) + + # Groups listed in order of addition to page (from back to front) + # Draw track backgrounds + # Draw feature cross track links + # Draw features and graphs + # Draw scale axes + # Draw scale labels + # Draw feature labels + # Draw track labels + element_groups = [ + greytrack_bgs, + feature_cross_links, + feature_elements, + scale_axes, + scale_labels, + feature_labels, + greytrack_labels, + ] + for element_group in element_groups: + for element_list in element_group: + [self.drawing.add(element) for element in element_list] + + if self.tracklines: + # Draw test tracks over top of diagram + self.draw_test_tracks() + + def draw_track(self, track): + """Return list of track elements and list of track labels.""" + track_elements = [] # Holds elements for features and graphs + track_labels = [] # Holds labels for features and graphs + + # Distribution dictionary for dealing with different set types + set_methods = {FeatureSet: self.draw_feature_set, GraphSet: self.draw_graph_set} + + for set in track.get_sets(): # Draw the feature or graph sets + elements, labels = set_methods[set.__class__](set) + track_elements += elements + track_labels += labels + return track_elements, track_labels + + def draw_feature_set(self, set): + """Return list of 
feature elements and list of labels for them.""" + # print('draw feature set') + feature_elements = [] # Holds diagram elements belonging to the features + label_elements = [] # Holds diagram elements belonging to feature labels + + # Collect all the elements for the feature set + for feature in set.get_features(): + if self.is_in_bounds(feature.start) or self.is_in_bounds(feature.end): + features, labels = self.draw_feature(feature) + feature_elements += features + label_elements += labels + + return feature_elements, label_elements + + def draw_feature(self, feature): + """Return list of feature elements and list of labels for them.""" + feature_elements = [] # Holds drawable elements for a single feature + label_elements = [] # Holds labels for a single feature + + if feature.hide: # Don't show feature: return early + return feature_elements, label_elements + + start, end = self._current_track_start_end() + # A single feature may be split into subfeatures, so loop over them + for locstart, locend in feature.locations: + if locend < start: + continue + locstart = max(locstart, start) + if end < locstart: + continue + locend = min(locend, end) + # Get sigil for the feature/ each subfeature + feature_sigil, label = self.get_feature_sigil(feature, locstart, locend) + feature_elements.append(feature_sigil) + if label is not None: # If there's a label + label_elements.append(label) + + return feature_elements, label_elements + + def get_feature_sigil(self, feature, locstart, locend, **kwargs): + """Return graphics for feature, and any required label for it. + + Arguments: + - feature Feature object + - locstart The start position of the feature + - locend The end position of the feature + + """ + # Establish the co-ordinates for the sigil + btm, ctr, top = self.track_radii[self.current_track_level] + + startangle, startcos, startsin = self.canvas_angle(locstart) + endangle, endcos, endsin = self.canvas_angle(locend) + midangle, midcos, midsin = self.canvas_angle(float(locend + locstart) / 2) + + # Distribution dictionary for various ways of drawing the feature + # Each method takes the inner and outer radii, the start and end angle + # subtended at the diagram center, and the color as arguments + draw_methods = { + "BOX": self._draw_sigil_box, + "OCTO": self._draw_sigil_cut_corner_box, + "JAGGY": self._draw_sigil_jaggy, + "ARROW": self._draw_sigil_arrow, + "BIGARROW": self._draw_sigil_big_arrow, + } + + # Get sigil for the feature, location dependent on the feature strand + method = draw_methods[feature.sigil] + kwargs["head_length_ratio"] = feature.arrowhead_length + kwargs["shaft_height_ratio"] = feature.arrowshaft_height + + # Support for clickable links... needs ReportLab 2.4 or later + # which added support for links in SVG output. 
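+        # If the feature defines a url attribute, pass it through to the
+        # sigil as a clickable hyperlink, titled with the feature name.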
+ if hasattr(feature, "url"): + kwargs["hrefURL"] = feature.url + kwargs["hrefTitle"] = feature.name + + sigil = method( + btm, + ctr, + top, + startangle, + endangle, + feature.strand, + color=feature.color, + border=feature.border, + **kwargs + ) + + if feature.label: # Feature needs a label + # The spaces are a hack to force a little space between the label + # and the edge of the feature + label = String( + 0, + 0, + " %s " % feature.name.strip(), + fontName=feature.label_font, + fontSize=feature.label_size, + fillColor=feature.label_color, + ) + labelgroup = Group(label) + if feature.label_strand: + strand = feature.label_strand + else: + strand = feature.strand + if feature.label_position in ("start", "5'", "left"): + # Position the label at the feature's start + if strand != -1: + label_angle = startangle + 0.5 * pi # Make text radial + sinval, cosval = startsin, startcos + else: + label_angle = endangle + 0.5 * pi # Make text radial + sinval, cosval = endsin, endcos + elif feature.label_position in ("middle", "center", "centre"): + # Position the label at the feature's midpoint + label_angle = midangle + 0.5 * pi # Make text radial + sinval, cosval = midsin, midcos + elif feature.label_position in ("end", "3'", "right"): + # Position the label at the feature's end + if strand != -1: + label_angle = endangle + 0.5 * pi # Make text radial + sinval, cosval = endsin, endcos + else: + label_angle = startangle + 0.5 * pi # Make text radial + sinval, cosval = startsin, startcos + elif startangle < pi: + # Default to placing the label the bottom of the feature + # as drawn on the page, meaning feature end on left half + label_angle = endangle + 0.5 * pi # Make text radial + sinval, cosval = endsin, endcos + else: + # Default to placing the label on the bottom of the feature, + # which means the feature end when on right hand half + label_angle = startangle + 0.5 * pi # Make text radial + sinval, cosval = startsin, startcos + if strand != -1: + # Feature label on top + radius = top + if startangle < pi: # Turn text round + label_angle -= pi + else: + labelgroup.contents[0].textAnchor = "end" + else: + # Feature label on bottom + radius = btm + if startangle < pi: # Turn text round and anchor end + label_angle -= pi + labelgroup.contents[0].textAnchor = "end" + x_pos = self.xcenter + radius * sinval + y_pos = self.ycenter + radius * cosval + coslabel = cos(label_angle) + sinlabel = sin(label_angle) + labelgroup.transform = ( + coslabel, + -sinlabel, + sinlabel, + coslabel, + x_pos, + y_pos, + ) + else: + # No label required + labelgroup = None + # if locstart > locend: + # print(locstart, locend, feature.strand, sigil, feature.name) + # print(locstart, locend, feature.name) + return sigil, labelgroup + + def draw_cross_link(self, cross_link): + """Draw a cross-link between features.""" + startA = cross_link.startA + startB = cross_link.startB + endA = cross_link.endA + endB = cross_link.endB + + if not self.is_in_bounds(startA) and not self.is_in_bounds(endA): + return None + if not self.is_in_bounds(startB) and not self.is_in_bounds(endB): + return None + + if startA < self.start: + startA = self.start + if startB < self.start: + startB = self.start + if self.end < endA: + endA = self.end + if self.end < endB: + endB = self.end + + trackobjA = cross_link._trackA(list(self._parent.tracks.values())) + trackobjB = cross_link._trackB(list(self._parent.tracks.values())) + assert trackobjA is not None + assert trackobjB is not None + if trackobjA == trackobjB: + raise NotImplementedError() + + if 
trackobjA.start is not None: + if endA < trackobjA.start: + return + startA = max(startA, trackobjA.start) + if trackobjA.end is not None: + if trackobjA.end < startA: + return + endA = min(endA, trackobjA.end) + if trackobjB.start is not None: + if endB < trackobjB.start: + return + startB = max(startB, trackobjB.start) + if trackobjB.end is not None: + if trackobjB.end < startB: + return + endB = min(endB, trackobjB.end) + + for track_level in self._parent.get_drawn_levels(): + track = self._parent[track_level] + if track == trackobjA: + trackA = track_level + if track == trackobjB: + trackB = track_level + if trackA == trackB: + raise NotImplementedError() + + startangleA, startcosA, startsinA = self.canvas_angle(startA) + startangleB, startcosB, startsinB = self.canvas_angle(startB) + endangleA, endcosA, endsinA = self.canvas_angle(endA) + endangleB, endcosB, endsinB = self.canvas_angle(endB) + + btmA, ctrA, topA = self.track_radii[trackA] + btmB, ctrB, topB = self.track_radii[trackB] + + if ctrA < ctrB: + return [ + self._draw_arc_poly( + topA, + btmB, + startangleA, + endangleA, + startangleB, + endangleB, + cross_link.color, + cross_link.border, + cross_link.flip, + ) + ] + else: + return [ + self._draw_arc_poly( + btmA, + topB, + startangleA, + endangleA, + startangleB, + endangleB, + cross_link.color, + cross_link.border, + cross_link.flip, + ) + ] + + def draw_graph_set(self, set): + """Return list of graph elements and list of their labels. + + Arguments: + - set GraphSet object + + """ + # print('draw graph set') + elements = [] # Holds graph elements + + # Distribution dictionary for how to draw the graph + style_methods = { + "line": self.draw_line_graph, + "heat": self.draw_heat_graph, + "bar": self.draw_bar_graph, + } + + for graph in set.get_graphs(): + elements += style_methods[graph.style](graph) + + return elements, [] + + def draw_line_graph(self, graph): + """Return line graph as list of drawable elements. 
+ + Arguments: + - graph GraphData object + + """ + line_elements = [] # holds drawable elements + + # Get graph data + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + btm, ctr, top = self.track_radii[self.current_track_level] + trackheight = 0.5 * (top - btm) + datarange = maxval - minval + if datarange == 0: + datarange = trackheight + + start, end = self._current_track_start_end() + data = graph[start:end] + + if not data: + return [] + + # midval is the value at which the x-axis is plotted, and is the + # central ring in the track + if graph.center is None: + midval = (maxval + minval) / 2.0 + else: + midval = graph.center + # Whichever is the greatest difference: max-midval or min-midval, is + # taken to specify the number of pixel units resolved along the + # y-axis + resolution = max((midval - minval), (maxval - midval)) + + # Start from first data point + pos, val = data[0] + lastangle, lastcos, lastsin = self.canvas_angle(pos) + # We calculate the track height + posheight = trackheight * (val - midval) / resolution + ctr + lastx = self.xcenter + posheight * lastsin # start xy coords + lasty = self.ycenter + posheight * lastcos + for pos, val in data: + posangle, poscos, possin = self.canvas_angle(pos) + posheight = trackheight * (val - midval) / resolution + ctr + x = self.xcenter + posheight * possin # next xy coords + y = self.ycenter + posheight * poscos + line_elements.append( + Line( + lastx, + lasty, + x, + y, + strokeColor=graph.poscolor, + strokeWidth=graph.linewidth, + ) + ) + lastx, lasty, = x, y + return line_elements + + def draw_bar_graph(self, graph): + """Return list of drawable elements for a bar graph. + + Arguments: + - graph Graph object + + """ + # At each point contained in the graph data, we draw a vertical bar + # from the track center to the height of the datapoint value (positive + # values go up in one color, negative go down in the alternative + # color). 
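+        # Bar height is trackheight * (val - midval) / resolution, so a
+        # value equal to midval gives a zero-height bar at the center ring.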
+ bar_elements = [] + + # Set the number of pixels per unit for the data + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + btm, ctr, top = self.track_radii[self.current_track_level] + trackheight = 0.5 * (top - btm) + datarange = maxval - minval + if datarange == 0: + datarange = trackheight + data = graph[self.start : self.end] + # midval is the value at which the x-axis is plotted, and is the + # central ring in the track + if graph.center is None: + midval = (maxval + minval) / 2.0 + else: + midval = graph.center + + # Convert data into 'binned' blocks, covering half the distance to the + # next data point on either side, accounting for the ends of fragments + # and tracks + start, end = self._current_track_start_end() + data = intermediate_points(start, end, graph[start:end]) + + if not data: + return [] + + # Whichever is the greatest difference: max-midval or min-midval, is + # taken to specify the number of pixel units resolved along the + # y-axis + resolution = max((midval - minval), (maxval - midval)) + if resolution == 0: + resolution = trackheight + + # Create elements for the bar graph based on newdata + for pos0, pos1, val in data: + pos0angle, pos0cos, pos0sin = self.canvas_angle(pos0) + pos1angle, pos1cos, pos1sin = self.canvas_angle(pos1) + + barval = trackheight * (val - midval) / resolution + if barval >= 0: + barcolor = graph.poscolor + else: + barcolor = graph.negcolor + + # Draw bar + bar_elements.append( + self._draw_arc(ctr, ctr + barval, pos0angle, pos1angle, barcolor) + ) + return bar_elements + + def draw_heat_graph(self, graph): + """Return list of drawable elements for the heat graph. + + Arguments: + - graph Graph object + + """ + # At each point contained in the graph data, we draw a box that is the + # full height of the track, extending from the midpoint between the + # previous and current data points to the midpoint between the current + # and next data points + heat_elements = [] # holds drawable elements + + # Get graph data + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + midval = (maxval + minval) / 2.0 # mid is the value at the X-axis + btm, ctr, top = self.track_radii[self.current_track_level] + trackheight = top - btm + + start, end = self._current_track_start_end() + data = intermediate_points(start, end, graph[start:end]) + + # Create elements on the graph, indicating a large positive value by + # the graph's poscolor, and a large negative value by the graph's + # negcolor attributes + for pos0, pos1, val in data: + pos0angle, pos0cos, pos0sin = self.canvas_angle(pos0) + pos1angle, pos1cos, pos1sin = self.canvas_angle(pos1) + + # Calculate the heat color, based on the differential between + # the value and the median value + heat = colors.linearlyInterpolatedColor( + graph.poscolor, graph.negcolor, maxval, minval, val + ) + + # Draw heat box + heat_elements.append( + self._draw_arc(btm, top, pos0angle, pos1angle, heat, border=heat) + ) + return heat_elements + + def draw_scale(self, track): + """Return list of elements in the scale and list of their labels. 
+ + Arguments: + - track Track object + + """ + scale_elements = [] # holds axes and ticks + scale_labels = [] # holds labels + + if not track.scale: + # no scale required, exit early + return [], [] + + # Get track locations + btm, ctr, top = self.track_radii[self.current_track_level] + trackheight = top - ctr + + # X-axis + start, end = self._current_track_start_end() + if track.start is not None or track.end is not None: + # Draw an arc, leaving out the wedge + p = ArcPath(strokeColor=track.scale_color, fillColor=None) + startangle, startcos, startsin = self.canvas_angle(start) + endangle, endcos, endsin = self.canvas_angle(end) + p.addArc( + self.xcenter, + self.ycenter, + ctr, + 90 - (endangle * 180 / pi), + 90 - (startangle * 180 / pi), + ) + scale_elements.append(p) + del p + # Y-axis start marker + x0, y0 = self.xcenter + btm * startsin, self.ycenter + btm * startcos + x1, y1 = self.xcenter + top * startsin, self.ycenter + top * startcos + scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color)) + # Y-axis end marker + x0, y0 = self.xcenter + btm * endsin, self.ycenter + btm * endcos + x1, y1 = self.xcenter + top * endsin, self.ycenter + top * endcos + scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color)) + elif self.sweep < 1: + # Draw an arc, leaving out the wedge + p = ArcPath(strokeColor=track.scale_color, fillColor=None) + # Note reportlab counts angles anti-clockwise from the horizontal + # (as in mathematics, e.g. complex numbers and polar coordinates) + # in degrees. + p.addArc( + self.xcenter, + self.ycenter, + ctr, + startangledegrees=90 - 360 * self.sweep, + endangledegrees=90, + ) + scale_elements.append(p) + del p + # Y-axis start marker + x0, y0 = self.xcenter, self.ycenter + btm + x1, y1 = self.xcenter, self.ycenter + top + scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color)) + # Y-axis end marker + alpha = 2 * pi * self.sweep + x0, y0 = self.xcenter + btm * sin(alpha), self.ycenter + btm * cos(alpha) + x1, y1 = self.xcenter + top * sin(alpha), self.ycenter + top * cos(alpha) + scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color)) + else: + # Draw a full circle + scale_elements.append( + Circle( + self.xcenter, + self.ycenter, + ctr, + strokeColor=track.scale_color, + fillColor=None, + ) + ) + + start, end = self._current_track_start_end() + if track.scale_ticks: # Ticks are required on the scale + # Draw large ticks + # I want the ticks to be consistently positioned relative to + # the start of the sequence (position 0), not relative to the + # current viewpoint (self.start and self.end) + + ticklen = track.scale_largeticks * trackheight + tickiterval = int(track.scale_largetick_interval) + # Note that we could just start the list of ticks using + # range(0,self.end,tickinterval) and the filter out the + # ones before self.start - but this seems wasteful. + # Using tickiterval * (self.start/tickiterval) is a shortcut. 
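+            # For example, self.start == 2500 with tickiterval == 1000 makes
+            # the loop start at 2000; ticks outside the drawn region are
+            # then skipped by the bounds check inside the loop.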
+ for tickpos in range( + tickiterval * (self.start // tickiterval), int(self.end), tickiterval + ): + if tickpos <= start or end <= tickpos: + continue + tick, label = self.draw_tick( + tickpos, ctr, ticklen, track, track.scale_largetick_labels + ) + scale_elements.append(tick) + if label is not None: # If there's a label, add it + scale_labels.append(label) + # Draw small ticks + ticklen = track.scale_smallticks * trackheight + tickiterval = int(track.scale_smalltick_interval) + for tickpos in range( + tickiterval * (self.start // tickiterval), int(self.end), tickiterval + ): + if tickpos <= start or end <= tickpos: + continue + tick, label = self.draw_tick( + tickpos, ctr, ticklen, track, track.scale_smalltick_labels + ) + scale_elements.append(tick) + if label is not None: # If there's a label, add it + scale_labels.append(label) + + # Check to see if the track contains a graph - if it does, get the + # minimum and maximum values, and put them on the scale Y-axis + # at 60 degree intervals, ordering the labels by graph_id + startangle, startcos, startsin = self.canvas_angle(start) + endangle, endcos, endsin = self.canvas_angle(end) + if track.axis_labels: + for set in track.get_sets(): + if set.__class__ is GraphSet: + # Y-axis + for n in range(7): + angle = n * 1.0471975511965976 + if angle < startangle or endangle < angle: + continue + ticksin, tickcos = sin(angle), cos(angle) + x0, y0 = ( + self.xcenter + btm * ticksin, + self.ycenter + btm * tickcos, + ) + x1, y1 = ( + self.xcenter + top * ticksin, + self.ycenter + top * tickcos, + ) + scale_elements.append( + Line(x0, y0, x1, y1, strokeColor=track.scale_color) + ) + + graph_label_min = [] + graph_label_max = [] + graph_label_mid = [] + for graph in set.get_graphs(): + quartiles = graph.quartiles() + minval, maxval = quartiles[0], quartiles[4] + if graph.center is None: + midval = (maxval + minval) / 2.0 + graph_label_min.append("%.3f" % minval) + graph_label_max.append("%.3f" % maxval) + graph_label_mid.append("%.3f" % midval) + else: + diff = max( + (graph.center - minval), (maxval - graph.center) + ) + minval = graph.center - diff + maxval = graph.center + diff + midval = graph.center + graph_label_mid.append("%.3f" % midval) + graph_label_min.append("%.3f" % minval) + graph_label_max.append("%.3f" % maxval) + xmid, ymid = (x0 + x1) / 2.0, (y0 + y1) / 2.0 + for limit, x, y in [ + (graph_label_min, x0, y0), + (graph_label_max, x1, y1), + (graph_label_mid, xmid, ymid), + ]: + label = String( + 0, + 0, + ";".join(limit), + fontName=track.scale_font, + fontSize=track.scale_fontsize, + fillColor=track.scale_color, + ) + label.textAnchor = "middle" + labelgroup = Group(label) + labelgroup.transform = ( + tickcos, + -ticksin, + ticksin, + tickcos, + x, + y, + ) + scale_labels.append(labelgroup) + + return scale_elements, scale_labels + + def draw_tick(self, tickpos, ctr, ticklen, track, draw_label): + """Return drawing element for a tick on the scale. + + Arguments: + - tickpos Int, position of the tick on the sequence + - ctr Float, Y co-ord of the center of the track + - ticklen How long to draw the tick + - track Track, the track the tick is drawn on + - draw_label Boolean, write the tick label? 
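+
+        Returns a (tick, label) tuple; label is None when draw_label is
+        false.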
+
+        """
+        # Calculate tick co-ordinates
+        tickangle, tickcos, ticksin = self.canvas_angle(tickpos)
+        x0, y0 = self.xcenter + ctr * ticksin, self.ycenter + ctr * tickcos
+        x1, y1 = (
+            self.xcenter + (ctr + ticklen) * ticksin,
+            self.ycenter + (ctr + ticklen) * tickcos,
+        )
+        # Calculate height of text label so it can be offset on lower half
+        # of diagram
+        # LP: not used, as not all fonts have ascent_descent data in reportlab.pdfbase._fontdata
+        # label_offset = _fontdata.ascent_descent[track.scale_font][0]*\
+        #               track.scale_fontsize/1000.
+        tick = Line(x0, y0, x1, y1, strokeColor=track.scale_color)
+        if draw_label:
+            # Put tick position on as label
+            if track.scale_format == "SInt":
+                if tickpos >= 1000000:
+                    tickstring = str(tickpos // 1000000) + " Mbp"
+                elif tickpos >= 1000:
+                    tickstring = str(tickpos // 1000) + " Kbp"
+                else:
+                    tickstring = str(tickpos)
+            else:
+                tickstring = str(tickpos)
+            label = String(
+                0,
+                0,
+                tickstring,  # Make label string
+                fontName=track.scale_font,
+                fontSize=track.scale_fontsize,
+                fillColor=track.scale_color,
+            )
+            if tickangle > pi:
+                label.textAnchor = "end"
+            # LP: This label_offset depends on ascent_descent data, which is not available for all
+            # fonts, so has been deprecated.
+            # if 0.5*pi < tickangle < 1.5*pi:
+            #     y1 -= label_offset
+            labelgroup = Group(label)
+            labelgroup.transform = (1, 0, 0, 1, x1, y1)
+        else:
+            labelgroup = None
+        return tick, labelgroup
+
+    def draw_test_tracks(self):
+        """Draw blue test tracks with green line down their center."""
+        # Add lines only for drawn tracks
+        for track in self.drawn_tracks:
+            btm, ctr, top = self.track_radii[track]
+            self.drawing.add(
+                Circle(
+                    self.xcenter,
+                    self.ycenter,
+                    top,
+                    strokeColor=colors.blue,
+                    fillColor=None,
+                )
+            )  # top line
+            self.drawing.add(
+                Circle(
+                    self.xcenter,
+                    self.ycenter,
+                    ctr,
+                    strokeColor=colors.green,
+                    fillColor=None,
+                )
+            )  # middle line
+            self.drawing.add(
+                Circle(
+                    self.xcenter,
+                    self.ycenter,
+                    btm,
+                    strokeColor=colors.blue,
+                    fillColor=None,
+                )
+            )  # bottom line
+
+    def draw_greytrack(self, track):
+        """Drawing element for grey background to passed Track object."""
+        greytrack_bgs = []  # Holds track backgrounds
+        greytrack_labels = []  # Holds track foreground labels
+
+        if not track.greytrack:  # No greytrack required, return early
+            return [], []
+
+        # Get track location
+        btm, ctr, top = self.track_radii[self.current_track_level]
+
+        start, end = self._current_track_start_end()
+        startangle, startcos, startsin = self.canvas_angle(start)
+        endangle, endcos, endsin = self.canvas_angle(end)
+
+        # Make background
+        if track.start is not None or track.end is not None:
+            # Draw an arc, leaving out the wedge
+            greytrack_bgs.append(
+                self._draw_arc(
+                    btm, top, startangle, endangle, colors.Color(0.96, 0.96, 0.96)
+                )
+            )
+        elif self.sweep < 1:
+            # Make a partial circle, a large arc box
+            # This method assumes the correct center for us.
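+            # e.g. a sweep of 0.9 leaves a 10% wedge of the circle open;
+            # the arc below then spans 0 to 2*pi*0.9 radians, measured
+            # clockwise from the vertical.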
+ greytrack_bgs.append( + self._draw_arc( + btm, top, 0, 2 * pi * self.sweep, colors.Color(0.96, 0.96, 0.96) + ) + ) + else: + # Make a full circle (using a VERY thick linewidth) + greytrack_bgs.append( + Circle( + self.xcenter, + self.ycenter, + ctr, + strokeColor=colors.Color(0.96, 0.96, 0.96), + fillColor=None, + strokeWidth=top - btm, + ) + ) + + if track.greytrack_labels: + # Labels are required for this track + labelstep = self.length // track.greytrack_labels # label interval + for pos in range(self.start, self.end, labelstep): + label = String( + 0, + 0, + track.name, # Add a new label at + fontName=track.greytrack_font, # each interval + fontSize=track.greytrack_fontsize, + fillColor=track.greytrack_fontcolor, + ) + theta, costheta, sintheta = self.canvas_angle(pos) + if theta < startangle or endangle < theta: + continue + x, y = ( + self.xcenter + btm * sintheta, + self.ycenter + btm * costheta, + ) # start text halfway up marker + labelgroup = Group(label) + labelangle = ( + self.sweep * 2 * pi * (pos - self.start) / self.length - pi / 2 + ) + if theta > pi: + label.textAnchor = "end" # Anchor end of text to inner radius + labelangle += pi # and reorient it + cosA, sinA = cos(labelangle), sin(labelangle) + labelgroup.transform = (cosA, -sinA, sinA, cosA, x, y) + if not self.length - x <= labelstep: # Don't overrun the circle + greytrack_labels.append(labelgroup) + + return greytrack_bgs, greytrack_labels + + def canvas_angle(self, base): + """Given base-pair position, return (angle, cosine, sin) (PRIVATE).""" + angle = self.sweep * 2 * pi * (base - self.start) / self.length + return (angle, cos(angle), sin(angle)) + + def _draw_sigil_box( + self, bottom, center, top, startangle, endangle, strand, **kwargs + ): + """Draw BOX sigil (PRIVATE).""" + if strand == 1: + inner_radius = center + outer_radius = top + elif strand == -1: + inner_radius = bottom + outer_radius = center + else: + inner_radius = bottom + outer_radius = top + return self._draw_arc( + inner_radius, outer_radius, startangle, endangle, **kwargs + ) + + def _draw_arc( + self, + inner_radius, + outer_radius, + startangle, + endangle, + color, + border=None, + colour=None, + **kwargs + ): + """Return closed path describing an arc box (PRIVATE). + + Arguments: + - inner_radius Float distance of inside of arc from drawing center + - outer_radius Float distance of outside of arc from drawing center + - startangle Float angle subtended by start of arc at drawing center + (in radians) + - endangle Float angle subtended by end of arc at drawing center + (in radians) + - color colors.Color object for arc (overridden by backwards + compatible argument with UK spelling, colour). + + Returns a closed path object describing an arced box corresponding to + the passed values. For very small angles, a simple four sided + polygon is used. + """ + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + + strokecolor, color = _stroke_and_fill_colors(color, border) + + if abs(float(endangle - startangle)) > 0.01: + # Wide arc, must use full curves + p = ArcPath(strokeColor=strokecolor, fillColor=color, strokewidth=0) + # Note reportlab counts angles anti-clockwise from the horizontal + # (as in mathematics, e.g. complex numbers and polar coordinates) + # but we use clockwise from the vertical. Also reportlab uses + # degrees, but we use radians. 
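+            # Worked example: a clockwise-from-vertical angle of pi/2
+            # (3 o'clock) maps to 90 - (pi/2 * 180/pi) = 0 degrees in
+            # reportlab's anti-clockwise-from-horizontal convention.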
+            p.addArc(
+                self.xcenter,
+                self.ycenter,
+                inner_radius,
+                90 - (endangle * 180 / pi),
+                90 - (startangle * 180 / pi),
+                moveTo=True,
+            )
+            p.addArc(
+                self.xcenter,
+                self.ycenter,
+                outer_radius,
+                90 - (endangle * 180 / pi),
+                90 - (startangle * 180 / pi),
+                reverse=True,
+            )
+            p.closePath()
+            return p
+        else:
+            # Cheat and just use a four sided polygon.
+            # Calculate trig values for angle and coordinates
+            startcos, startsin = cos(startangle), sin(startangle)
+            endcos, endsin = cos(endangle), sin(endangle)
+            x0, y0 = self.xcenter, self.ycenter  # origin of the circle
+            x1, y1 = (x0 + inner_radius * startsin, y0 + inner_radius * startcos)
+            x2, y2 = (x0 + inner_radius * endsin, y0 + inner_radius * endcos)
+            x3, y3 = (x0 + outer_radius * endsin, y0 + outer_radius * endcos)
+            x4, y4 = (x0 + outer_radius * startsin, y0 + outer_radius * startcos)
+            return draw_polygon([(x1, y1), (x2, y2), (x3, y3), (x4, y4)], color, border)
+
+    def _draw_arc_line(
+        self, path, start_radius, end_radius, start_angle, end_angle, move=False
+    ):
+        """Add a list of points to a path object (PRIVATE).
+
+        Assumes angles given are in degrees!
+
+        Represents what would be a straight line on a linear diagram.
+        """
+        x0, y0 = self.xcenter, self.ycenter  # origin of the circle
+        radius_diff = end_radius - start_radius
+        angle_diff = end_angle - start_angle
+        dx = 0.01  # heuristic
+        a = start_angle * pi / 180
+        if move:
+            path.moveTo(x0 + start_radius * cos(a), y0 + start_radius * sin(a))
+        else:
+            path.lineTo(x0 + start_radius * cos(a), y0 + start_radius * sin(a))
+        x = dx
+        if 0.01 <= abs(dx):
+            while x < 1:
+                r = start_radius + x * radius_diff
+                a = (
+                    (start_angle + x * (angle_diff)) * pi / 180
+                )  # to radians for sin/cos
+                # print(x0+r*cos(a), y0+r*sin(a))
+                path.lineTo(x0 + r * cos(a), y0 + r * sin(a))
+                x += dx
+        a = end_angle * pi / 180
+        path.lineTo(x0 + end_radius * cos(a), y0 + end_radius * sin(a))
+
+    def _draw_arc_poly(
+        self,
+        inner_radius,
+        outer_radius,
+        inner_startangle,
+        inner_endangle,
+        outer_startangle,
+        outer_endangle,
+        color,
+        border=None,
+        flip=False,
+        **kwargs
+    ):
+        """Return polygon path describing an arc (PRIVATE)."""
+        strokecolor, color = _stroke_and_fill_colors(color, border)
+
+        x0, y0 = self.xcenter, self.ycenter  # origin of the circle
+        if (
+            abs(inner_endangle - outer_startangle) > 0.01
+            or abs(outer_endangle - inner_startangle) > 0.01
+            or abs(inner_startangle - outer_startangle) > 0.01
+            or abs(inner_endangle - outer_endangle) > 0.01
+        ):
+            # Wide arc, must use full curves
+            p = ArcPath(
+                strokeColor=strokecolor,
+                fillColor=color,
+                # default is mitre/miter which can stick out too much:
+                strokeLineJoin=1,  # 1=round
+                strokewidth=0,
+            )
+            # Note reportlab counts angles anti-clockwise from the horizontal
+            # (as in mathematics, e.g. complex numbers and polar coordinates)
+            # but we use clockwise from the vertical. Also reportlab uses
+            # degrees, but we use radians.
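+            # The four conversions below apply the same mapping,
+            # degrees_reportlab = 90 - radians_ours * 180 / pi, to each
+            # corner angle of the polygon.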
+ i_start = 90 - (inner_startangle * 180 / pi) + i_end = 90 - (inner_endangle * 180 / pi) + o_start = 90 - (outer_startangle * 180 / pi) + o_end = 90 - (outer_endangle * 180 / pi) + p.addArc(x0, y0, inner_radius, i_end, i_start, moveTo=True, reverse=True) + if flip: + # Flipped, join end to start, + self._draw_arc_line(p, inner_radius, outer_radius, i_end, o_start) + p.addArc(x0, y0, outer_radius, o_end, o_start, reverse=True) + self._draw_arc_line(p, outer_radius, inner_radius, o_end, i_start) + else: + # Not flipped, join start to start, end to end + self._draw_arc_line(p, inner_radius, outer_radius, i_end, o_end) + p.addArc(x0, y0, outer_radius, o_end, o_start, reverse=False) + self._draw_arc_line(p, outer_radius, inner_radius, o_start, i_start) + p.closePath() + return p + else: + # Cheat and just use a four sided polygon. + # Calculate trig values for angle and coordinates + inner_startcos, inner_startsin = ( + cos(inner_startangle), + sin(inner_startangle), + ) + inner_endcos, inner_endsin = cos(inner_endangle), sin(inner_endangle) + outer_startcos, outer_startsin = ( + cos(outer_startangle), + sin(outer_startangle), + ) + outer_endcos, outer_endsin = cos(outer_endangle), sin(outer_endangle) + x1, y1 = ( + x0 + inner_radius * inner_startsin, + y0 + inner_radius * inner_startcos, + ) + x2, y2 = ( + x0 + inner_radius * inner_endsin, + y0 + inner_radius * inner_endcos, + ) + x3, y3 = ( + x0 + outer_radius * outer_endsin, + y0 + outer_radius * outer_endcos, + ) + x4, y4 = ( + x0 + outer_radius * outer_startsin, + y0 + outer_radius * outer_startcos, + ) + return draw_polygon( + [(x1, y1), (x2, y2), (x3, y3), (x4, y4)], + color, + border, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + ) + + def _draw_sigil_cut_corner_box( + self, + bottom, + center, + top, + startangle, + endangle, + strand, + color, + border=None, + corner=0.5, + **kwargs + ): + """Draw OCTO sigil, box with corners cut off (PRIVATE).""" + if strand == 1: + inner_radius = center + outer_radius = top + elif strand == -1: + inner_radius = bottom + outer_radius = center + else: + inner_radius = bottom + outer_radius = top + + strokecolor, color = _stroke_and_fill_colors(color, border) + + startangle, endangle = min(startangle, endangle), max(startangle, endangle) + angle = float(endangle - startangle) + + middle_radius = 0.5 * (inner_radius + outer_radius) + boxheight = outer_radius - inner_radius + + corner_len = min(0.5 * boxheight, 0.5 * boxheight * corner) + shaft_inner_radius = inner_radius + corner_len + shaft_outer_radius = outer_radius - corner_len + + cornerangle_delta = max( + 0.0, min(abs(boxheight) * 0.5 * corner / middle_radius, abs(angle * 0.5)) + ) + if angle < 0: + cornerangle_delta *= -1 # reverse it + + # Calculate trig values for angle and coordinates + startcos, startsin = cos(startangle), sin(startangle) + endcos, endsin = cos(endangle), sin(endangle) + x0, y0 = self.xcenter, self.ycenter # origin of the circle + p = ArcPath( + strokeColor=strokecolor, + fillColor=color, + strokeLineJoin=1, # 1=round + strokewidth=0, + **kwargs + ) + # Inner curved edge + p.addArc( + self.xcenter, + self.ycenter, + inner_radius, + 90 - ((endangle - cornerangle_delta) * 180 / pi), + 90 - ((startangle + cornerangle_delta) * 180 / pi), + moveTo=True, + ) + # Corner edge - straight lines assumes small angle! + # TODO - Use self._draw_arc_line(p, ...) 
here if we expose corner setting + p.lineTo(x0 + shaft_inner_radius * startsin, y0 + shaft_inner_radius * startcos) + p.lineTo(x0 + shaft_outer_radius * startsin, y0 + shaft_outer_radius * startcos) + # Outer curved edge + p.addArc( + self.xcenter, + self.ycenter, + outer_radius, + 90 - ((endangle - cornerangle_delta) * 180 / pi), + 90 - ((startangle + cornerangle_delta) * 180 / pi), + reverse=True, + ) + # Corner edges + p.lineTo(x0 + shaft_outer_radius * endsin, y0 + shaft_outer_radius * endcos) + p.lineTo(x0 + shaft_inner_radius * endsin, y0 + shaft_inner_radius * endcos) + p.closePath() + return p + + def _draw_sigil_arrow( + self, bottom, center, top, startangle, endangle, strand, **kwargs + ): + """Draw ARROW sigil (PRIVATE).""" + if strand == 1: + inner_radius = center + outer_radius = top + orientation = "right" + elif strand == -1: + inner_radius = bottom + outer_radius = center + orientation = "left" + else: + inner_radius = bottom + outer_radius = top + orientation = "right" # backwards compatibility + return self._draw_arc_arrow( + inner_radius, + outer_radius, + startangle, + endangle, + orientation=orientation, + **kwargs + ) + + def _draw_sigil_big_arrow( + self, bottom, center, top, startangle, endangle, strand, **kwargs + ): + """Draw BIGARROW sigil, like ARROW but straddles the axis (PRIVATE).""" + if strand == -1: + orientation = "left" + else: + orientation = "right" + return self._draw_arc_arrow( + bottom, top, startangle, endangle, orientation=orientation, **kwargs + ) + + def _draw_arc_arrow( + self, + inner_radius, + outer_radius, + startangle, + endangle, + color, + border=None, + shaft_height_ratio=0.4, + head_length_ratio=0.5, + orientation="right", + colour=None, + **kwargs + ): + """Draw an arrow along an arc (PRIVATE).""" + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + + strokecolor, color = _stroke_and_fill_colors(color, border) + + # if orientation == 'right': + # startangle, endangle = min(startangle, endangle), max(startangle, endangle) + # elif orientation == 'left': + # startangle, endangle = max(startangle, endangle), min(startangle, endangle) + # else: + startangle, endangle = min(startangle, endangle), max(startangle, endangle) + if orientation != "left" and orientation != "right": + raise ValueError( + "Invalid orientation %r, should be 'left' or 'right'" % orientation + ) + + angle = float(endangle - startangle) # angle subtended by arc + middle_radius = 0.5 * (inner_radius + outer_radius) + boxheight = outer_radius - inner_radius + shaft_height = boxheight * shaft_height_ratio + shaft_inner_radius = middle_radius - 0.5 * shaft_height + shaft_outer_radius = middle_radius + 0.5 * shaft_height + headangle_delta = max( + 0.0, min(abs(boxheight) * head_length_ratio / middle_radius, abs(angle)) + ) + if angle < 0: + headangle_delta *= -1 # reverse it + if orientation == "right": + headangle = endangle - headangle_delta + else: + headangle = startangle + headangle_delta + if startangle <= endangle: + headangle = max(min(headangle, endangle), startangle) + else: + headangle = max(min(headangle, startangle), endangle) + if not ( + startangle <= headangle <= endangle or endangle <= headangle <= startangle + ): + raise RuntimeError( + "Problem drawing arrow, invalid positions. 
" + "Start angle: %s, Head angle: %s, " + "End angle: %s, Angle: %s" % (startangle, headangle, endangle, angle) + ) + + # Calculate trig values for angle and coordinates + startcos, startsin = cos(startangle), sin(startangle) + headcos, headsin = cos(headangle), sin(headangle) + endcos, endsin = cos(endangle), sin(endangle) + x0, y0 = self.xcenter, self.ycenter # origin of the circle + if 0.5 >= abs(angle) and abs(headangle_delta) >= abs(angle): + # If the angle is small, and the arrow is all head, + # cheat and just use a triangle. + if orientation == "right": + x1, y1 = (x0 + inner_radius * startsin, y0 + inner_radius * startcos) + x2, y2 = (x0 + outer_radius * startsin, y0 + outer_radius * startcos) + x3, y3 = (x0 + middle_radius * endsin, y0 + middle_radius * endcos) + else: + x1, y1 = (x0 + inner_radius * endsin, y0 + inner_radius * endcos) + x2, y2 = (x0 + outer_radius * endsin, y0 + outer_radius * endcos) + x3, y3 = (x0 + middle_radius * startsin, y0 + middle_radius * startcos) + # return draw_polygon([(x1,y1),(x2,y2),(x3,y3)], color, border, + # stroke_line_join=1) + return Polygon( + [x1, y1, x2, y2, x3, y3], + strokeColor=border or color, + fillColor=color, + strokeLineJoin=1, # 1=round, not mitre! + strokewidth=0, + ) + elif orientation == "right": + p = ArcPath( + strokeColor=strokecolor, + fillColor=color, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + **kwargs + ) + # Note reportlab counts angles anti-clockwise from the horizontal + # (as in mathematics, e.g. complex numbers and polar coordinates) + # but we use clockwise from the vertical. Also reportlab uses + # degrees, but we use radians. + p.addArc( + self.xcenter, + self.ycenter, + shaft_inner_radius, + 90 - (headangle * 180 / pi), + 90 - (startangle * 180 / pi), + moveTo=True, + ) + p.addArc( + self.xcenter, + self.ycenter, + shaft_outer_radius, + 90 - (headangle * 180 / pi), + 90 - (startangle * 180 / pi), + reverse=True, + ) + if abs(angle) < 0.5: + p.lineTo(x0 + outer_radius * headsin, y0 + outer_radius * headcos) + p.lineTo(x0 + middle_radius * endsin, y0 + middle_radius * endcos) + p.lineTo(x0 + inner_radius * headsin, y0 + inner_radius * headcos) + else: + self._draw_arc_line( + p, + outer_radius, + middle_radius, + 90 - (headangle * 180 / pi), + 90 - (endangle * 180 / pi), + ) + self._draw_arc_line( + p, + middle_radius, + inner_radius, + 90 - (endangle * 180 / pi), + 90 - (headangle * 180 / pi), + ) + p.closePath() + return p + else: + p = ArcPath( + strokeColor=strokecolor, + fillColor=color, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + **kwargs + ) + # Note reportlab counts angles anti-clockwise from the horizontal + # (as in mathematics, e.g. complex numbers and polar coordinates) + # but we use clockwise from the vertical. Also reportlab uses + # degrees, but we use radians. 
+            p.addArc(
+                self.xcenter,
+                self.ycenter,
+                shaft_inner_radius,
+                90 - (endangle * 180 / pi),
+                90 - (headangle * 180 / pi),
+                moveTo=True,
+                reverse=True,
+            )
+            p.addArc(
+                self.xcenter,
+                self.ycenter,
+                shaft_outer_radius,
+                90 - (endangle * 180 / pi),
+                90 - (headangle * 180 / pi),
+                reverse=False,
+            )
+            # Note - two straight lines are only a good approximation for a
+            # small head angle; in general curved lines are needed here:
+            if abs(angle) < 0.5:
+                p.lineTo(x0 + outer_radius * headsin, y0 + outer_radius * headcos)
+                p.lineTo(x0 + middle_radius * startsin, y0 + middle_radius * startcos)
+                p.lineTo(x0 + inner_radius * headsin, y0 + inner_radius * headcos)
+            else:
+                self._draw_arc_line(
+                    p,
+                    outer_radius,
+                    middle_radius,
+                    90 - (headangle * 180 / pi),
+                    90 - (startangle * 180 / pi),
+                )
+                self._draw_arc_line(
+                    p,
+                    middle_radius,
+                    inner_radius,
+                    90 - (startangle * 180 / pi),
+                    90 - (headangle * 180 / pi),
+                )
+            p.closePath()
+            return p
+
+    def _draw_sigil_jaggy(
+        self,
+        bottom,
+        center,
+        top,
+        startangle,
+        endangle,
+        strand,
+        color,
+        border=None,
+        **kwargs
+    ):
+        """Draw JAGGY sigil (PRIVATE).
+
+        Although we may in future expose the head/tail jaggy lengths, for now
+        both the left and right edges are drawn jagged.
+        """
+        if strand == 1:
+            inner_radius = center
+            outer_radius = top
+            teeth = 2
+        elif strand == -1:
+            inner_radius = bottom
+            outer_radius = center
+            teeth = 2
+        else:
+            inner_radius = bottom
+            outer_radius = top
+            teeth = 4
+
+        # TODO, expose these settings?
+        tail_length_ratio = 1.0
+        head_length_ratio = 1.0
+
+        strokecolor, color = _stroke_and_fill_colors(color, border)
+
+        startangle, endangle = min(startangle, endangle), max(startangle, endangle)
+        angle = float(endangle - startangle)  # angle subtended by arc
+        height = outer_radius - inner_radius
+
+        assert startangle <= endangle and angle >= 0
+        if head_length_ratio and tail_length_ratio:
+            headangle = max(
+                endangle
+                - min(height * head_length_ratio / (center * teeth), angle * 0.5),
+                startangle,
+            )
+            tailangle = min(
+                startangle
+                + min(height * tail_length_ratio / (center * teeth), angle * 0.5),
+                endangle,
+            )
+            # With very small features, floating point calculations can
+            # violate the assertion below that start <= tail <= head <= end
+            tailangle = min(tailangle, headangle)
+        elif head_length_ratio:
+            headangle = max(
+                endangle - min(height * head_length_ratio / (center * teeth), angle),
+                startangle,
+            )
+            tailangle = startangle
+        else:
+            headangle = endangle
+            tailangle = min(
+                startangle + min(height * tail_length_ratio / (center * teeth), angle),
+                endangle,
+            )
+
+        if not startangle <= tailangle <= headangle <= endangle:
+            raise RuntimeError(
+                "Problem drawing jaggy sigil, invalid "
+                "positions. Start angle: %s, "
+                "Tail angle: %s, Head angle: %s, End angle: %s, "
+                "Angle: %s" % (startangle, tailangle, headangle, endangle, angle)
+            )
+
+        # Calculate trig values for angle and coordinates
+        startcos, startsin = cos(startangle), sin(startangle)
+        headcos, headsin = cos(headangle), sin(headangle)
+        endcos, endsin = cos(endangle), sin(endangle)
+        x0, y0 = self.xcenter, self.ycenter  # origin of the circle
+
+        p = ArcPath(
+            strokeColor=strokecolor,
+            fillColor=color,
+            # default is mitre/miter which can stick out too much:
+            strokeLineJoin=1,  # 1=round
+            strokewidth=0,
+            **kwargs
+        )
+        # Note reportlab counts angles anti-clockwise from the horizontal
+        # (as in mathematics, e.g. complex numbers and polar coordinates)
+        # but we use clockwise from the vertical. 
Also reportlab uses + # degrees, but we use radians. + p.addArc( + self.xcenter, + self.ycenter, + inner_radius, + 90 - (headangle * 180 / pi), + 90 - (tailangle * 180 / pi), + moveTo=True, + ) + for i in range(0, teeth): + p.addArc( + self.xcenter, + self.ycenter, + inner_radius + i * height / teeth, + 90 - (tailangle * 180 / pi), + 90 - (startangle * 180 / pi), + ) + # Curved line needed when drawing long jaggies + self._draw_arc_line( + p, + inner_radius + i * height / teeth, + inner_radius + (i + 1) * height / teeth, + 90 - (startangle * 180 / pi), + 90 - (tailangle * 180 / pi), + ) + p.addArc( + self.xcenter, + self.ycenter, + outer_radius, + 90 - (headangle * 180 / pi), + 90 - (tailangle * 180 / pi), + reverse=True, + ) + for i in range(0, teeth): + p.addArc( + self.xcenter, + self.ycenter, + outer_radius - i * height / teeth, + 90 - (endangle * 180 / pi), + 90 - (headangle * 180 / pi), + reverse=True, + ) + # Curved line needed when drawing long jaggies + self._draw_arc_line( + p, + outer_radius - i * height / teeth, + outer_radius - (i + 1) * height / teeth, + 90 - (endangle * 180 / pi), + 90 - (headangle * 180 / pi), + ) + p.closePath() + return p diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py b/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py new file mode 100644 index 0000000..a37e107 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py @@ -0,0 +1,234 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""Colors module. + +Provides: + +- ColorTranslator - class to convert tuples of integers and floats into + colors.Color objects + +For drawing capabilities, this module uses reportlab to define colors: +http://www.reportlab.com +""" + +# ReportLab imports +from reportlab.lib import colors + + +class ColorTranslator: + """Class providing methods for translating representations of color into. 
+ + Examples + -------- + >>> from Bio.Graphics import GenomeDiagram + >>> gdct=GenomeDiagram._Colors.ColorTranslator() + >>> print(gdct.float1_color((0.5, 0.5, 0.5))) + Color(.5,.5,.5,1) + >>> print(gdct.int255_color((1, 75, 240))) + Color(.003922,.294118,.941176,1) + >>> print(gdct.artemis_color(7)) + Color(1,1,0,1) + >>> print(gdct.scheme_color(2)) + Color(1,0,0,1) + >>> gdct.get_artemis_colorscheme() + {0: (Color(1,1,1,1), 'pathogenicity, adaptation, chaperones'), 1: (Color(.39,.39,.39,1), 'energy metabolism'), 2: (Color(1,0,0,1), 'information transfer'), 3: (Color(0,1,0,1), 'surface'), 4: (Color(0,0,1,1), 'stable RNA'), 5: (Color(0,1,1,1), 'degradation of large molecules'), 6: (Color(1,0,1,1), 'degradation of small molecules'), 7: (Color(1,1,0,1), 'central/intermediary/miscellaneous metabolism'), 8: (Color(.6,.98,.6,1), 'unknown'), 9: (Color(.53,.81,.98,1), 'regulators'), 10: (Color(1,.65,0,1), 'conserved hypotheticals'), 11: (Color(.78,.59,.39,1), 'pseudogenes and partial genes'), 12: (Color(1,.78,.78,1), 'phage/IS elements'), 13: (Color(.7,.7,.7,1), 'some miscellaneous information'), 14: (Color(0,0,0,1), ''), 15: (Color(1,.25,.25,1), 'secondary metabolism'), 16: (Color(1,.5,.5,1), ''), 17: (Color(1,.75,.75,1), '')} + + >>> print(gdct.translate((0.5, 0.5, 0.5))) + Color(.5,.5,.5,1) + >>> print(gdct.translate((1, 75, 240))) + Color(.003922,.294118,.941176,1) + >>> print(gdct.translate(7)) + Color(1,1,0,1) + >>> print(gdct.translate(2)) + Color(1,0,0,1) + + """ + + def __init__(self, filename=None): + """Initialize. + + Argument filename is the location of a file containing + colorscheme information. + """ + self._artemis_colorscheme = { + 0: (colors.Color(1, 1, 1), "pathogenicity, adaptation, chaperones"), + 1: (colors.Color(0.39, 0.39, 0.39), "energy metabolism"), + 2: (colors.Color(1, 0, 0), "information transfer"), + 3: (colors.Color(0, 1, 0), "surface"), + 4: (colors.Color(0, 0, 1), "stable RNA"), + 5: (colors.Color(0, 1, 1), "degradation of large molecules"), + 6: (colors.Color(1, 0, 1), "degradation of small molecules"), + 7: (colors.Color(1, 1, 0), "central/intermediary/miscellaneous metabolism"), + 8: (colors.Color(0.60, 0.98, 0.60), "unknown"), + 9: (colors.Color(0.53, 0.81, 0.98), "regulators"), + 10: (colors.Color(1, 0.65, 0), "conserved hypotheticals"), + 11: (colors.Color(0.78, 0.59, 0.39), "pseudogenes and partial genes"), + 12: (colors.Color(1, 0.78, 0.78), "phage/IS elements"), + 13: (colors.Color(0.70, 0.70, 0.70), "some miscellaneous information"), + 14: (colors.Color(0, 0, 0), ""), + 15: (colors.Color(1, 0.25, 0.25), "secondary metabolism"), + 16: (colors.Color(1, 0.5, 0.5), ""), + 17: (colors.Color(1, 0.75, 0.75), ""), + } # Hardwired Artemis color scheme + self._colorscheme = {} + if filename is not None: + self.read_colorscheme(filename) # Imported color scheme + else: + self._colorscheme = self._artemis_colorscheme + + def translate(self, color=None, colour=None): + """Translate a color into a ReportLab Color object. + + Arguments: + - color - Color defined as an int, a tuple of three ints 0->255 + or a tuple of three floats 0 -> 1, or a string giving + one of the named colors defined by ReportLab, or a + ReportLab color object (returned as is). + - colour - Backwards compatible alias using UK spelling (which + will over-ride any color argument). 
+
+        Returns a colors.Color object, determined semi-intelligently
+        depending on the input values
+        """
+        # Let the UK spelling (colour) override the USA spelling (color)
+        if colour is not None:
+            color = colour
+
+        if color is None:
+            raise ValueError("Passed color (or colour) must be a valid color type")
+        elif isinstance(color, int):
+            color = self.scheme_color(color)
+        elif isinstance(color, colors.Color):
+            return color
+        elif isinstance(color, str):
+            # Assume it's a named reportlab color like "red".
+            color = colors.toColor(color)
+        elif isinstance(color, tuple) and isinstance(color[0], float):
+            color = self.float1_color(color)
+        elif isinstance(color, tuple) and isinstance(color[0], int):
+            color = self.int255_color(color)
+        return color
+
+    def read_colorscheme(self, filename):
+        r"""Load colour scheme from file.
+
+        Reads information from a file containing color information and stores
+        it internally.
+
+        Argument filename is the location of a file defining colors in
+        tab-separated format plaintext as::
+
+            INT \t RED \t GREEN \t BLUE \t Comment
+
+        Where RED, GREEN and BLUE are intensities in the range 0 -> 255, e.g.::
+
+            2 \t 255 \t 0 \t 0 \t Red: Information transfer
+
+        """
+        with open(filename) as lines:
+            for line in lines:
+                data = line.strip().split("\t")
+                try:
+                    label = int(data[0])
+                    red, green, blue = int(data[1]), int(data[2]), int(data[3])
+                    if len(data) > 4:
+                        comment = data[4]
+                    else:
+                        comment = ""
+                    self._colorscheme[label] = (
+                        self.int255_color((red, green, blue)),
+                        comment,
+                    )
+                except ValueError:
+                    raise ValueError(
+                        "Expected INT \t INT \t INT \t INT \t string input"
+                    ) from None
+
+    def get_artemis_colorscheme(self):
+        """Return the Artemis color scheme as a dictionary."""
+        return self._artemis_colorscheme
+
+    def artemis_color(self, value):
+        """Artemis color (integer) to ReportLab Color object.
+
+        Arguments:
+         - value: An int representing a functional class in the Artemis
+           color scheme (see www.sanger.ac.uk for a description),
+           or a string from a GenBank feature annotation for the
+           color which may be dot delimited (in which case the
+           first value is used).
+
+        Takes an int representing a functional class in the Artemis color
+        scheme, and returns the appropriate colors.Color object
+        """
+        try:
+            value = int(value)
+        except ValueError:
+            if value.count("."):  # dot-delimited
+                value = int(value.split(".", 1)[0])  # Use only first integer
+            else:
+                raise
+        if value in self._artemis_colorscheme:
+            return self._artemis_colorscheme[value][0]
+        else:
+            raise ValueError("Artemis color out of range: %d" % value)
+
+    def get_colorscheme(self):
+        """Return the user-defined color scheme as a dictionary."""
+        return self._colorscheme
+
+    def scheme_color(self, value):
+        """Map a user-defined color integer to a ReportLab Color object.
+
+         - value: An int representing a single color in the user-defined
+           color scheme
+
+        Takes an int representing a user-defined color and returns the
+        appropriate colors.Color object.
+        """
+        if value in self._colorscheme:
+            return self._colorscheme[value][0]
+        else:
+            raise ValueError("Scheme color out of range: %d" % value)
+
+    def int255_color(self, values):
+        """Map integer (red, green, blue) tuple to a ReportLab Color object.
+
+         - values: A tuple of (red, green, blue) intensities as
+           integers in the range 0->255
+
+        Takes a tuple of (red, green, blue) intensity values in the range
+        0 -> 255 and returns an appropriate colors.Color object.
+        """
+        red, green, blue = values
+        factor = 1 / 255.0
+        red, green, blue = red * factor, green * factor, blue * factor
+        return colors.Color(red, green, blue)
+
+    def float1_color(self, values):
+        """Map float (red, green, blue) tuple to a ReportLab Color object.
+
+         - values: A tuple of (red, green, blue) intensities as floats
+           in the range 0 -> 1
+
+        Takes a tuple of (red, green, blue) intensity values in the range
+        0 -> 1 and returns an appropriate colors.Color object.
+        """
+        red, green, blue = values
+        return colors.Color(red, green, blue)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=2)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py b/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py
new file mode 100644
index 0000000..7958de4
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py
@@ -0,0 +1,100 @@
+# Copyright 2011-2017 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Genome Diagram Feature cross-link module."""
+
+from reportlab.lib import colors
+
+
+class CrossLink:
+    """Hold information for drawing a cross link between features."""
+
+    def __init__(
+        self, featureA, featureB, color=colors.lightgreen, border=None, flip=False
+    ):
+        """Create a new cross link.
+
+        Arguments featureA and featureB should be GenomeDiagram feature
+        objects, or 3-tuples (track object, start, end), and currently must
+        be on different tracks.
+
+        The color and border arguments should be ReportLab colour objects, or
+        for border use a boolean False for no border, otherwise it defaults to
+        the same as the main colour.
+
+        The flip argument draws an inverted cross link, useful for showing a
+        mapping where one sequence has been reversed. It is conventional to
+        also use a different colour (e.g. red for simple links, blue for any
+        flipped links).
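+
+        For example, CrossLink(featureA, featureB, color=colors.blue,
+        flip=True) would draw an inverted link in blue, following the
+        convention above.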
+ """ + # Initialize attributes + self.featureA = featureA + self.featureB = featureB + self.color = color # default color to draw the feature + self.border = border + self.flip = flip + + @property + def startA(self): + """Start position of Feature A.""" + try: + return self.featureA.start + except AttributeError: + track, start, end = self.featureA + return start + + @property + def endA(self): + """End position of Feature A.""" + try: + return self.featureA.end + except AttributeError: + track, start, end = self.featureA + return end + + def _trackA(self, tracks): + try: + track, start, end = self.featureA + assert track in tracks + return track + except TypeError: + for track in tracks: + for feature_set in track.get_sets(): + if hasattr(feature_set, "features"): + if self.featureA in feature_set.features.values(): + return track + return None + + @property + def startB(self): + """Start position of Feature B.""" + try: + return self.featureB.start + except AttributeError: + track, start, end = self.featureB + return start + + @property + def endB(self): + """End position of Feature B.""" + try: + return self.featureB.end + except AttributeError: + track, start, end = self.featureB + return end + + def _trackB(self, tracks): + try: + track, start, end = self.featureB + assert track in tracks + return track + except TypeError: + for track in tracks: + for feature_set in track.get_sets(): + if hasattr(feature_set, "features"): + if self.featureB in feature_set.features.values(): + return track + return None diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py b/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py new file mode 100644 index 0000000..fa44970 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py @@ -0,0 +1,411 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +"""Provides a container for information concerning the tracks to be drawn in a diagram. + +It also provides the interface for defining the diagram (possibly split these +functions in later version?). + +For drawing capabilities, this module uses reportlab to draw and write the +diagram: + +http://www.reportlab.com + +For dealing with biological information, the package expects BioPython +objects - namely SeqRecord objects containing SeqFeature objects. +""" + +try: + from reportlab.graphics import renderPM +except ImportError: + # This is an optional part of ReportLab, so may not be installed. + renderPM = None + +from ._LinearDrawer import LinearDrawer +from ._CircularDrawer import CircularDrawer +from ._Track import Track + +from Bio.Graphics import _write + + +def _first_defined(*args): + """Return the first non-null argument (PRIVATE).""" + for arg in args: + if arg is not None: + return arg + return None + + +class Diagram: + """Diagram container. + + Arguments: + - name - a string, identifier for the diagram. + - tracks - a list of Track objects comprising the diagram. + - format - a string, format of the diagram 'circular' or + 'linear', depending on the sort of diagram required. + - pagesize - a string, the pagesize of output describing the ISO + size of the image, or a tuple of pixels. 
+     - orientation - a string describing the required orientation of the
+       final drawing ('landscape' or 'portrait').
+     - x - a float (0->1), the proportion of the page to take
+       up with even X margins to the page.
+     - y - a float (0->1), the proportion of the page to take
+       up with even Y margins to the page.
+     - xl - a float (0->1), the proportion of the page to take
+       up with the left X margin to the page (overrides x).
+     - xr - a float (0->1), the proportion of the page to take
+       up with the right X margin to the page (overrides x).
+     - yt - a float (0->1), the proportion of the page to take
+       up with the top Y margin to the page (overrides y).
+     - yb - a float (0->1), the proportion of the page to take
+       up with the bottom Y margin to the page (overrides y).
+     - circle_core - a float, the proportion of the available radius to
+       leave empty at the center of a circular diagram (0 to 1).
+     - start - an integer, the base/aa position to start the diagram at.
+     - end - an integer, the base/aa position to end the diagram at.
+     - tracklines - a boolean, True if track guidelines are to be drawn.
+     - fragments - an integer, for a linear diagram, the number of equal
+       divisions into which the sequence is divided.
+     - fragment_size - a float (0->1), the proportion of the space
+       available to each fragment that should be used in drawing.
+     - track_size - a float (0->1), the proportion of the space
+       available to each track that should be used in drawing with sigils.
+     - circular - a boolean, True if the genome/sequence to be drawn
+       is, in reality, circular.
+
+    """
+
+    def __init__(
+        self,
+        name=None,
+        format="circular",
+        pagesize="A3",
+        orientation="landscape",
+        x=0.05,
+        y=0.05,
+        xl=None,
+        xr=None,
+        yt=None,
+        yb=None,
+        start=None,
+        end=None,
+        tracklines=False,
+        fragments=10,
+        fragment_size=None,
+        track_size=0.75,
+        circular=True,
+        circle_core=0.0,
+    ):
+        """Initialize.
+
+        gdd = Diagram(name=None)
+        """
+        self.tracks = {}  # Holds all Track objects, keyed by level
+        self.name = name  # Description of the diagram
+        # Diagram page setup attributes
+        self.format = format
+        self.pagesize = pagesize
+        self.orientation = orientation
+        self.x = x
+        self.y = y
+        self.xl = xl
+        self.xr = xr
+        self.yt = yt
+        self.yb = yb
+        self.start = start
+        self.end = end
+        self.tracklines = tracklines
+        self.fragments = fragments
+        if fragment_size is not None:
+            self.fragment_size = fragment_size
+        else:
+            if self.fragments == 1:
+                # For single fragments, default to full height
+                self.fragment_size = 1
+            else:
+                # Otherwise keep a 10% gap between fragments
+                self.fragment_size = 0.9
+        self.track_size = track_size
+        self.circular = circular
+        self.circle_core = circle_core
+        self.cross_track_links = []
+        self.drawing = None
+
+    def set_all_tracks(self, attr, value):
+        """Set the passed attribute of all tracks in the set to the passed value.
+
+        Arguments:
+         - attr - An attribute of the Track class.
+         - value - The value to set that attribute.
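+
+        For example, set_all_tracks("greytrack", 1) would switch on the
+        grey background for every track in the diagram.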
+ + set_all_tracks(self, attr, value) + """ + for track in self.tracks.values(): + if hasattr(track, attr): + # If the feature has the attribute set it to the passed value + setattr(track, attr, value) + + def draw( + self, + format=None, + pagesize=None, + orientation=None, + x=None, + y=None, + xl=None, + xr=None, + yt=None, + yb=None, + start=None, + end=None, + tracklines=None, + fragments=None, + fragment_size=None, + track_size=None, + circular=None, + circle_core=None, + cross_track_links=None, + ): + """Draw the diagram, with passed parameters overriding existing attributes. + + gdd.draw(format='circular') + """ + # Pass the parameters to the drawer objects that will build the + # diagrams. At the moment, we detect overrides with an or in the + # Instantiation arguments, but I suspect there's a neater way to do + # this. + if format == "linear": + drawer = LinearDrawer( + self, + _first_defined(pagesize, self.pagesize), + _first_defined(orientation, self.orientation), + _first_defined(x, self.x), + _first_defined(y, self.y), + _first_defined(xl, self.xl), + _first_defined(xr, self.xr), + _first_defined(yt, self.yt), + _first_defined(yb, self.yb), + _first_defined(start, self.start), + _first_defined(end, self.end), + _first_defined(tracklines, self.tracklines), + _first_defined(fragments, self.fragments), + _first_defined(fragment_size, self.fragment_size), + _first_defined(track_size, self.track_size), + _first_defined(cross_track_links, self.cross_track_links), + ) + else: + drawer = CircularDrawer( + self, + _first_defined(pagesize, self.pagesize), + _first_defined(orientation, self.orientation), + _first_defined(x, self.x), + _first_defined(y, self.y), + _first_defined(xl, self.xl), + _first_defined(xr, self.xr), + _first_defined(yt, self.yt), + _first_defined(yb, self.yb), + _first_defined(start, self.start), + _first_defined(end, self.end), + _first_defined(tracklines, self.tracklines), + _first_defined(track_size, self.track_size), + _first_defined(circular, self.circular), + _first_defined(circle_core, self.circle_core), + _first_defined(cross_track_links, self.cross_track_links), + ) + drawer.draw() # Tell the drawer to complete the drawing + self.drawing = drawer.drawing # Get the completed drawing + + def write(self, filename="test1.ps", output="PS", dpi=72): + """Write the drawn diagram to a specified file, in a specified format. + + Arguments: + - filename - a string indicating the name of the output file, + or a handle to write to. + - output - a string indicating output format, one of PS, PDF, + SVG, or provided the ReportLab renderPM module is installed, one + of the bitmap formats JPG, BMP, GIF, PNG, TIFF or TIFF. The + format can be given in upper or lower case. + - dpi - an integer. Resolution (dots per inch) for bitmap formats. + + Returns: + No return value. + + write(self, filename='test1.ps', output='PS', dpi=72) + + """ + return _write(self.drawing, filename, output, dpi=dpi) + + def write_to_string(self, output="PS", dpi=72): + """Return a byte string containing the diagram in the requested format. + + Arguments: + - output - a string indicating output format, one of PS, PDF, + SVG, JPG, BMP, GIF, PNG, TIFF or TIFF (as specified for the write + method). + - dpi - Resolution (dots per inch) for bitmap formats. + + Returns: + Return the completed drawing as a bytes string in a prescribed + format. + + """ + # The ReportLab drawToString method, which this function used to call, + # originally just used a StringIO handle with the drawToFile method. 
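+        # Using a BytesIO buffer lets self.write() target an in-memory
+        # stream, so file output and string output share one code path.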
+ # + # TODO - Rename this method to include keyword bytes? + from io import BytesIO + + handle = BytesIO() + self.write(handle, output, dpi) + return handle.getvalue() + + def add_track(self, track, track_level): + """Add a Track object to the diagram. + + It also accepts instructions to place it at a particular level on the + diagram. + + Arguments: + - track - Track object to draw. + - track_level - an integer. The level at which the track will be + drawn (above an arbitrary baseline). + + add_track(self, track, track_level) + """ + if track is None: + raise ValueError("Must specify track") + if track_level not in self.tracks: # No track at that level + self.tracks[track_level] = track # so just add it + else: # Already a track there, so shunt all higher tracks up one + occupied_levels = sorted( + self.get_levels() + ) # Get list of occupied levels... + occupied_levels.reverse() # ...reverse it (highest first) + for val in occupied_levels: + # If track value >= that to be added + if val >= track.track_level: + self.tracks[val + 1] = self.tracks[val] # ...increment by 1 + self.tracks[track_level] = track # And put the new track in + self.tracks[track_level].track_level = track_level + + def new_track(self, track_level, **args): + """Add a new Track to the diagram at a given level. + + The track is returned for further user manipulation. + + Arguments: + - track_level - an integer. The level at which the track will be + drawn (above an arbitrary baseline). + + new_track(self, track_level) + """ + newtrack = Track() + for key in args: + setattr(newtrack, key, args[key]) + if track_level not in self.tracks: # No track at that level + self.tracks[track_level] = newtrack # so just add it + else: # Already a track there, so shunt all higher tracks up one + occupied_levels = sorted( + self.get_levels() + ) # Get list of occupied levels... + occupied_levels.reverse() # ...reverse (highest first)... + for val in occupied_levels: + if val >= track_level: + # Track value >= that to be added, increment by 1 + self.tracks[val + 1] = self.tracks[val] + self.tracks[track_level] = newtrack # And put the new track in + self.tracks[track_level].track_level = track_level + return newtrack + + def del_track(self, track_level): + """Remove the track to be drawn at a particular level on the diagram. + + Arguments: + - track_level - an integer. The level of the track on the diagram + to delete. + + del_track(self, track_level) + """ + del self.tracks[track_level] + + def get_tracks(self): + """Return a list of the tracks contained in the diagram.""" + return list(self.tracks.values()) + + def move_track(self, from_level, to_level): + """Move a track from one level on the diagram to another. + + Arguments: + - from_level - an integer. The level at which the track to be + moved is found. + - to_level - an integer. The level to move the track to. + + """ + aux = self.tracks[from_level] + del self.tracks[from_level] + self.add_track(aux, to_level) + + def renumber_tracks(self, low=1, step=1): + """Renumber all tracks consecutively. + + Optionally from a passed lowest number. + + Arguments: + - low - an integer. The track number to start from. + - step - an integer. The track interval for separation of + tracks. + + """ + track = low # Start numbering from here + levels = self.get_levels() + + conversion = {} # Holds new set of levels + for level in levels: # Starting at low... 
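+            # e.g. with low=1, step=1, existing levels {3: A, 7: B}
+            # become {1: A, 2: B}.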
+ conversion[track] = self.tracks[level] # Add old tracks to new set + conversion[track].track_level = track + track += step # step interval + self.tracks = conversion # Replace old set of levels with new set + + def get_levels(self): + """Return a sorted list of levels occupied by tracks in the diagram.""" + return sorted(self.tracks) + + def get_drawn_levels(self): + """Return a sorted list of levels occupied by tracks. + + These tracks are not explicitly hidden. + """ + return sorted(key for key in self.tracks if not self.tracks[key].hide) + + def range(self): + """Return lowest and highest base numbers from track features. + + Returned type is a tuple. + """ + lows, highs = [], [] + for track in self.tracks.values(): # Get ranges for each track + low, high = track.range() + lows.append(low) + highs.append(high) + return min(lows), max(highs) # Return extremes from all tracks + + def __getitem__(self, key): + """Return the track contained at the level of the passed key.""" + return self.tracks[key] + + def __str__(self): + """Return a formatted string describing the diagram.""" + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d tracks" % len(self.tracks)) + for level in self.get_levels(): + outstr.append("Track %d: %s\n" % (level, self.tracks[level])) + outstr = "\n".join(outstr) + return outstr diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py b/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py new file mode 100644 index 0000000..87be16e --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py @@ -0,0 +1,198 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""Feature module. + +Provides: + - Feature - class to wrap Bio.SeqFeature objects with drawing information + +For drawing capabilities, this module uses reportlab to define colors: +http://www.reportlab.com +""" + +# ReportLab imports +from reportlab.lib import colors + +# GenomeDiagram imports +from ._Colors import ColorTranslator + + +class Feature: + """Class to wrap Bio.SeqFeature objects for GenomeDiagram. + + Attributes: + - parent FeatureSet, container for the object + - id Unique id + - color color.Color, color to draw the feature + - hide Boolean for whether the feature will be drawn or not + - sigil String denoting the type of sigil to use for the feature. + Currently either "BOX" or "ARROW" are supported. + - arrowhead_length Float denoting length of the arrow head to be drawn, + relative to the bounding box height. The arrow shaft + takes up the remainder of the bounding box's length. + - arrowshaft_height Float denoting length of the representative arrow + shaft to be drawn, relative to the bounding box height. + The arrow head takes the full height of the bound box. 
+     - name_qualifiers  List of Strings, describes the qualifiers that may
+       contain feature names in the wrapped Bio.SeqFeature object
+     - label    Boolean, 1 if the label should be shown
+     - label_font    String describing the font to use for the feature label
+     - label_size    Int describing the feature label font size
+     - label_color   color.Color describing the feature label color
+     - label_angle   Float describing the angle through which to rotate the
+       feature label in degrees (default = 45, linear only)
+     - label_position    String, 'start', 'end' or 'middle' denoting where
+       to place the feature label. Leave as None for the default
+       which is 'start' for linear diagrams, and at the bottom of
+       the feature as drawn on circular diagrams.
+     - label_strand  Integer -1 or +1 to explicitly place the label on the
+       forward or reverse strand. Default (None) follows the
+       feature's strand. Use -1 to put labels under (linear) or
+       inside (circular) the track, +1 to put them above (linear)
+       or outside (circular) the track.
+     - locations     List of tuples of (start, end) ints describing where the
+       feature and any subfeatures start and end
+     - type      String denoting the feature type
+     - name      String denoting the feature name
+     - strand    Int describing the strand on which the feature is found
+
+    """
+
+    def __init__(
+        self,
+        parent=None,
+        feature_id=None,
+        feature=None,
+        color=colors.lightgreen,
+        label=0,
+        border=None,
+        colour=None,
+    ):
+        """Initialize.
+
+        Arguments:
+         - parent    FeatureSet containing the feature
+         - feature_id    Unique id for the feature
+         - feature   Bio.SeqFeature object to be wrapped
+         - color    color.Color Color to draw the feature (overridden
+           by backwards compatible argument with UK spelling, colour).
+           Either argument is overridden if 'color' is found in feature
+           qualifiers
+         - border   color.Color Color to draw the feature border, use
+           None for the same as the fill color, False for no border.
+ - label Boolean, 1 if the label should be shown + + """ + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + + self._colortranslator = ColorTranslator() + + # Initialize attributes + self.parent = parent + self.id = feature_id + self.color = color # default color to draw the feature + self.border = border + self._feature = None # Bio.SeqFeature object to wrap + self.hide = 0 # show by default + self.sigil = "BOX" + self.arrowhead_length = 0.5 # 50% of the box height + self.arrowshaft_height = 0.4 # 40% of the box height + self.name_qualifiers = ["gene", "label", "name", "locus_tag", "product"] + self.label = label + self.label_font = "Helvetica" + self.label_size = 6 + self.label_color = colors.black + self.label_angle = 45 + self.label_position = None # Expect 'start', 'middle', or 'end' (plus aliases) + self.label_strand = None # Expect +1 or -1 if overriding this + + if feature is not None: + self.set_feature(feature) + + def set_feature(self, feature): + """Define the Bio.SeqFeature object to be wrapped.""" + self._feature = feature + self.__process_feature() + + def __process_feature(self): + """Examine wrapped feature and set some properties accordingly (PRIVATE).""" + self.locations = [] + bounds = [] + # This will be a list of length one for simple FeatureLocation: + for location in self._feature.location.parts: + start = location.nofuzzy_start + end = location.nofuzzy_end + # if start > end and self.strand == -1: + # start, end = end, start + self.locations.append((start, end)) + bounds += [start, end] + self.type = str(self._feature.type) # Feature type + # TODO - Strand can vary with subfeatures (e.g. mixed strand tRNA) + if self._feature.strand is None: + # This is the SeqFeature default (None), but the drawing code + # only expects 0, +1 or -1. + self.strand = 0 + else: + self.strand = int(self._feature.strand) # Feature strand + if "color" in self._feature.qualifiers: # Artemis color (if present) + self.color = self._colortranslator.artemis_color( + self._feature.qualifiers["color"][0] + ) + self.name = self.type + for qualifier in self.name_qualifiers: + if qualifier in self._feature.qualifiers: + self.name = self._feature.qualifiers[qualifier][0] + break + # Note will be 0 to N for origin wrapping feature on genome of length N + self.start, self.end = min(bounds), max(bounds) + + def get_feature(self): + """Return the unwrapped Bio.SeqFeature object.""" + return self._feature + + def set_colour(self, colour): + """Backwards compatible variant of set_color(self, color) using UK spelling.""" + color = self._colortranslator.translate(colour) + self.color = color + + def set_color(self, color): + """Set the color in which the feature will be drawn. + + Arguments: + - color The color to draw the feature - either a colors.Color + object, an RGB tuple of floats, or an integer corresponding a + colors in colors.txt + + """ + # TODO - Make this into the set method for a color property? + color = self._colortranslator.translate(color) + self.color = color + + def __getattr__(self, name): + """Get attribute by name. + + If the Feature class doesn't have the attribute called for, + check in self._feature for it. 
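+
+        Raises AttributeError if neither the Feature wrapper nor the
+        wrapped SeqFeature has the attribute.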
+
+        """
+        return getattr(self._feature, name)  # try to get the attribute from the feature
+
+
+################################################################################
+# RUN AS SCRIPT
+################################################################################
+
+if __name__ == "__main__":
+
+    # Test code
+    gdf = Feature()
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py b/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py
new file mode 100644
index 0000000..4168a29
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py
@@ -0,0 +1,210 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact:       Leighton Pritchard, The James Hutton Institute,
+#                Invergowrie, Dundee, Scotland, DD2 5DA, UK
+#                Leighton.Pritchard@hutton.ac.uk
+################################################################################
+#
+# Thanks to Peter Cock for the impetus to write the get_features() code to
+# subselect Features.
+#
+################################################################################
+
+"""FeatureSet module.
+
+Provides:
+ - FeatureSet - container for Feature objects
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+
+# GenomeDiagram
+from ._Feature import Feature
+
+# Builtins
+import re
+
+
+class FeatureSet:
+    """FeatureSet object."""
+
+    def __init__(self, set_id=None, name=None, parent=None):
+        """Create the object.
+
+        Arguments:
+         - set_id: Unique id for the set
+         - name: String identifying the feature set
+
+        """
+        self.parent = parent
+        self.id = set_id  # Unique id for the set
+        self.next_id = 0  # counter for unique feature ids
+        self.features = {}  # Holds features, keyed by ID
+        self.name = name  # String describing the set
+
+    def add_feature(self, feature, **kwargs):
+        """Add a new feature.
+
+        Arguments:
+         - feature: Bio.SeqFeature object
+         - kwargs: Keyword arguments for Feature. Named attributes
+           of the Feature
+
+        Add a Bio.SeqFeature object to the diagram (will be stored
+        internally in a Feature wrapper).
+        """
+        id = self.next_id  # get id number
+        f = Feature(self, id, feature)
+        self.features[id] = f  # add feature
+        for key in kwargs:
+            if key == "colour" or key == "color":
+                # Deal with "colour" as a special case by also mapping to color.
+                # If Feature.py used a python property we wouldn't need to call
+                # set_color explicitly. However, this is important to make sure
+                # every color gets mapped to a colors object - for example color
+                # numbers, or strings (may not matter for PDF, but does for PNG).
+                self.features[id].set_color(kwargs[key])
+                continue
+            setattr(self.features[id], key, kwargs[key])
+        self.next_id += 1  # increment next id
+        return f
+
+    def del_feature(self, feature_id):
+        """Delete a feature.
+
+        Arguments:
+         - feature_id: Unique id of the feature to delete
+
+        Remove a feature from the set, indicated by its id.
+        """
+        del self.features[feature_id]
+
+    def set_all_features(self, attr, value):
+        """Set an attribute of all the features.
+
+        Arguments:
+         - attr: An attribute of the Feature class
+         - value: The value to set that attribute to
+
+        Set the passed attribute of all features in the set to the
+        passed value.
+ """ + for feature in self.features.values(): + if hasattr(feature, attr): + # If the feature has the attribute, set it to the passed value + setattr(feature, attr, value) + + # For backwards compatibility, we support both colour and color. + # As a quick hack, make "colour" set both "colour" and "color". + # if attr=="colour": + # self.set_all_feature("color",value) + + def get_features(self, attribute=None, value=None, comparator=None): + """Retrieve features. + + Arguments: + - attribute: String, attribute of a Feature object + - value: The value desired of the attribute + - comparator: String, how to compare the Feature attribute to the + passed value + + If no attribute or value is given, return a list of all features in the + feature set. If both an attribute and value are given, then depending + on the comparator, then a list of all features in the FeatureSet + matching (or not) the passed value will be returned. Allowed comparators + are: 'startswith', 'not', 'like'. + + The user is expected to make a responsible decision about which feature + attributes to use with which passed values and comparator settings. + """ + # If no attribute or value specified, return all features + if attribute is None or value is None: + return list(self.features.values()) + # If no comparator is specified, return all features where the attribute + # value matches that passed + if comparator is None: + return [ + feature + for feature in self.features.values() + if getattr(feature, attribute) == value + ] + # If the comparator is 'not', return all features where the attribute + # value does not match that passed + elif comparator == "not": + return [ + feature + for feature in self.features.values() + if getattr(feature, attribute) != value + ] + # If the comparator is 'startswith', return all features where the attribute + # value does not match that passed + elif comparator == "startswith": + return [ + feature + for feature in self.features.values() + if getattr(feature, attribute).startswith(value) + ] + # If the comparator is 'like', use a regular expression search to identify + # features + elif comparator == "like": + return [ + feature + for feature in self.features.values() + if re.search(value, getattr(feature, attribute)) + ] + # As a final option, just return an empty list + return [] + + def get_ids(self): + """Return a list of all ids for the feature set.""" + return list(self.features.keys()) + + def range(self): + """Return the lowest and highest base (or mark) numbers as a tuple.""" + lows, highs = [], [] + for feature in self.features.values(): + for start, end in feature.locations: + lows.append(start) + highs.append(end) + if len(lows) != 0 and len(highs) != 0: # Default in case there is + return (min(lows), max(highs)) # nothing in the set + return 0, 0 + + def to_string(self, verbose=0): + """Return a formatted string with information about the set. 
+ + Arguments: + - verbose: Boolean indicating whether a short (default) or + complete account of the set is required + + """ + if not verbose: # Short account only required + return "%s" % self + else: # Long account desired + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d features" % len(self.features)) + for key in self.features: + outstr.append("feature: %s" % self.features[key]) + return "\n".join(outstr) + + def __len__(self): + """Return the number of features in the set.""" + return len(self.features) + + def __getitem__(self, key): + """Return a feature, keyed by id.""" + return self.features[key] + + def __str__(self): + """Return a formatted string with information about the feature set.""" + outstr = [ + "\n<%s: %s %d features>" % (self.__class__, self.name, len(self.features)) + ] + return "\n".join(outstr) diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py b/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py new file mode 100644 index 0000000..7f99ef9 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py @@ -0,0 +1,195 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2008-2009 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""Graph module. + +Provides: + - GraphData - Contains data from which a graph will be drawn, and + information about its presentation + +For drawing capabilities, this module uses reportlab to draw and write +the diagram: http://www.reportlab.com +""" + +# ReportLab imports + +from reportlab.lib import colors + +from math import sqrt + + +class GraphData: + """Graph Data. + + Attributes: + - id Unique identifier for the data + - data Dictionary of describing the data, keyed by position + - name String describing the data + - style String ('bar', 'heat', 'line') describing how to draw the data + - poscolor colors.Color for drawing high (some styles) or all + values + - negcolor colors.Color for drawing low values (some styles) + - linewidth Int, thickness to draw the line in 'line' styles + + """ + + def __init__( + self, + id=None, + data=None, + name=None, + style="bar", + color=colors.lightgreen, + altcolor=colors.darkseagreen, + center=None, + colour=None, + altcolour=None, + ): + """Initialize. + + Arguments: + - id Unique ID for the graph + - data List of (position, value) tuples + - name String describing the graph + - style String describing the presentation style ('bar', 'line', + 'heat') + - color colors.Color describing the color to draw all or the + 'high' (some styles) values (overridden by backwards + compatible argument with UK spelling, colour). + - altcolor colors.Color describing the color to draw the 'low' + values (some styles only) (overridden by backwards + compatible argument with UK spelling, colour). + - center Value at which x-axis crosses y-axis. 
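+
+        A minimal construction sketch (values are illustrative):
+
+            gc = GraphData(id=0,
+                           data=[(0, 0.42), (1000, 0.55), (2000, 0.48)],
+                           name="GC content",
+                           style="line")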
+ + """ + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + if altcolour is not None: + altcolor = altcolour + + self.id = id # Unique identifier for the graph + self.data = {} # holds values, keyed by sequence position + if data is not None: + self.set_data(data) + self.name = name # Descriptive string + + # Attributes describing how the graph will be drawn + self.style = style # One of 'bar', 'heat' or 'line' + self.poscolor = color # Color to draw all, or 'high' values + self.negcolor = altcolor # Color to draw 'low' values + self.linewidth = 2 # linewidth to use in line graphs + self.center = center # value at which x-axis crosses y-axis + + def set_data(self, data): + """Add data as a list of (position, value) tuples.""" + for (pos, val) in data: # Fill data dictionary + self.data[pos] = val + + def get_data(self): + """Return data as a list of sorted (position, value) tuples.""" + data = [] + for xval in self.data: + yval = self.data[xval] + data.append((xval, yval)) + data.sort() + return data + + def add_point(self, point): + """Add a single point to the set of data as a (position, value) tuple.""" + pos, val = point + self.data[pos] = val + + def quartiles(self): + """Return (minimum, lowerQ, medianQ, upperQ, maximum) values as tuple.""" + data = sorted(self.data.values()) + datalen = len(data) + return ( + data[0], + data[datalen // 4], + data[datalen // 2], + data[3 * datalen // 4], + data[-1], + ) + + def range(self): + """Return range of data as (start, end) tuple. + + Returns the range of the data, i.e. its start and end points on + the genome as a (start, end) tuple. + """ + positions = sorted(self.data) # i.e. dict keys + # Return first and last positions in graph + # print(len(self.data)) + return (positions[0], positions[-1]) + + def mean(self): + """Return the mean value for the data points (float).""" + data = list(self.data.values()) + sum = 0.0 + for item in data: + sum += float(item) + return sum / len(data) + + def stdev(self): + """Return the sample standard deviation for the data (float).""" + data = list(self.data.values()) + m = self.mean() + runtotal = 0.0 + for entry in data: + runtotal += float((entry - m) ** 2) + # This is sample standard deviation; population stdev would involve + # division by len(data), rather than len(data)-1 + return sqrt(runtotal / (len(data) - 1)) + + def __len__(self): + """Return the number of points in the data set.""" + return len(self.data) + + def __getitem__(self, index): + """Return data value(s) at the given position. + + Given an integer representing position on the sequence + returns a float - the data value at the passed position. + + If a slice, returns graph data from the region as a list or + (position, value) tuples. Slices with step are not supported. + """ + if isinstance(index, int): + return self.data[index] + elif isinstance(index, slice): + # TODO - Why does it treat the end points both as inclusive? + # This doesn't match Python norms does it? 
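+            # Note: both slice end points are treated as inclusive here, so
+            # graph[1000:2000] returns the points at positions 1000..2000
+            # inclusive - unlike normal Python slicing, where the stop
+            # index is excluded.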
+ low = index.start + high = index.stop + if index.step is not None and index.step != 1: + raise ValueError + outlist = [] + for pos in sorted(self.data): + if pos >= low and pos <= high: + outlist.append((pos, self.data[pos])) + return outlist + else: + raise TypeError("Need an integer or a slice") + + def __str__(self): + """Return a string describing the graph data.""" + outstr = ["\nGraphData: %s, ID: %s" % (self.name, self.id)] + outstr.append("Number of points: %d" % len(self.data)) + outstr.append("Mean data value: %s" % self.mean()) + outstr.append("Sample SD: %.3f" % self.stdev()) + outstr.append( + "Minimum: %s\n1Q: %s\n2Q: %s\n3Q: %s\nMaximum: %s" % self.quartiles() + ) + outstr.append("Sequence Range: %s..%s" % self.range()) + return "\n".join(outstr) diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py b/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py new file mode 100644 index 0000000..d79e6ef --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py @@ -0,0 +1,171 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2008-2010 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ +# +# TODO: Make representation of Ymax and Ymin values at this level, so that +# calculation of graph/axis drawing is simplified + +"""GraphSet module. + +Provides: + - GraphSet - container for GraphData objects + +For drawing capabilities, this module uses reportlab to draw and write +the diagram: http://www.reportlab.com +""" + +# ReportLab imports + +from reportlab.lib import colors + +from ._Graph import GraphData + + +class GraphSet: + """Graph Set. + + Attributes: + - id Unique identifier for the set + - name String describing the set + + """ + + def __init__(self, name=None): + """Initialize. + + Arguments: + - name String identifying the graph set sensibly + + """ + self.id = id # Unique identifier for the set + self._next_id = 0 # Holds unique ids for graphs + self._graphs = {} # Holds graphs, keyed by unique id + self.name = name # Holds description of graph + + def new_graph( + self, + data, + name=None, + style="bar", + color=colors.lightgreen, + altcolor=colors.darkseagreen, + linewidth=1, + center=None, + colour=None, + altcolour=None, + centre=None, + ): + """Add a GraphData object to the diagram. + + Arguments: + - data List of (position, value) int tuples + - name String, description of the graph + - style String ('bar', 'heat', 'line') describing how the graph + will be drawn + - color colors.Color describing the color to draw all or 'high' + (some styles) data (overridden by backwards compatible + argument with UK spelling, colour). + - altcolor colors.Color describing the color to draw 'low' (some + styles) data (overridden by backwards compatible argument + with UK spelling, colour). + - linewidth Float describing linewidth for graph + - center Float setting the value at which the x-axis + crosses the y-axis (overridden by backwards + compatible argument with UK spelling, centre) + + Add a GraphData object to the diagram (will be stored internally). 
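+
+        For example (a sketch with illustrative values):
+
+            gset = GraphSet(name="GC skew")
+            gset.new_graph([(0, -0.1), (500, 0.25), (1000, 0.05)],
+                           name="GC skew", style="bar", center=0.0)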
+ """ + # Let the UK spelling (colour) override the USA spelling (color) + if colour is not None: + color = colour + if altcolour is not None: + altcolor = altcolour + if centre is not None: + center = centre + + id = self._next_id # get id number + graph = GraphData(id, data, name, style, color, altcolor, center) + graph.linewidth = linewidth + self._graphs[id] = graph # add graph data + self._next_id += 1 # increment next id + return graph + + def del_graph(self, graph_id): + """Remove a graph from the set, indicated by its id.""" + del self._graphs[graph_id] + + def get_graphs(self): + """Return list of all graphs in the graph set, sorted by id. + + Sorting is to ensure reliable stacking. + """ + return [self._graphs[id] for id in sorted(self._graphs)] + + def get_ids(self): + """Return a list of all ids for the graph set.""" + return list(self._graphs.keys()) + + def range(self): + """Return the lowest and highest base (or mark) numbers as a tuple.""" + lows, highs = [], [] + for graph in self._graphs.values(): + low, high = graph.range() + lows.append(low) + highs.append(high) + return (min(lows), max(highs)) + + def data_quartiles(self): + """Return (minimum, lowerQ, medianQ, upperQ, maximum) values as a tuple.""" + data = [] + for graph in self._graphs.values(): + data += list(graph.data.values()) + data.sort() + datalen = len(data) + return ( + data[0], + data[datalen / 4], + data[datalen / 2], + data[3 * datalen / 4], + data[-1], + ) + + def to_string(self, verbose=0): + """Return a formatted string with information about the set. + + Arguments: + - verbose - Flag indicating whether a short or complete account + of the set is required + + """ + if not verbose: + return "%s" % self + else: + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d graphs" % len(self._graphs)) + for key in self._graphs: + outstr.append("%s" % self._graphs[key]) + return "\n".join(outstr) + + def __len__(self): + """Return the number of graphs in the set.""" + return len(self._graphs) + + def __getitem__(self, key): + """Return a graph, keyed by id.""" + return self._graphs[key] + + def __str__(self): + """Return a formatted string with information about the feature set.""" + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d graphs" % len(self._graphs)) + outstr = "\n".join(outstr) + return outstr diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py new file mode 100644 index 0000000..36012ad --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py @@ -0,0 +1,1580 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2008-2009 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""Linear Drawer module. 
+ +Provides: + - LinearDrawer - Drawing object for linear diagrams + +For drawing capabilities, this module uses reportlab to draw and write +the diagram: http://www.reportlab.com +""" + +# ReportLab imports + +from reportlab.graphics.shapes import Drawing, Line, String, Group, Polygon +from reportlab.lib import colors + +# GenomeDiagram imports +from ._AbstractDrawer import AbstractDrawer, draw_box, draw_arrow +from ._AbstractDrawer import draw_cut_corner_box, _stroke_and_fill_colors +from ._AbstractDrawer import intermediate_points, angle2trig, deduplicate +from ._FeatureSet import FeatureSet +from ._GraphSet import GraphSet + +from math import ceil + + +class LinearDrawer(AbstractDrawer): + """Linear Drawer. + + Inherits from: + - AbstractDrawer + + Attributes: + - tracklines Boolean for whether to draw lines delineating tracks + - pagesize Tuple describing the size of the page in pixels + - x0 Float X co-ord for leftmost point of drawable area + - xlim Float X co-ord for rightmost point of drawable area + - y0 Float Y co-ord for lowest point of drawable area + - ylim Float Y co-ord for topmost point of drawable area + - pagewidth Float pixel width of drawable area + - pageheight Float pixel height of drawable area + - xcenter Float X co-ord of center of drawable area + - ycenter Float Y co-ord of center of drawable area + - start Int, base to start drawing from + - end Int, base to stop drawing at + - length Int, size of sequence to be drawn + - fragments Int, number of fragments into which to divide the + drawn sequence + - fragment_size Float (0->1) the proportion of the fragment height to + draw in + - track_size Float (0->1) the proportion of the track height to + draw in + - drawing Drawing canvas + - drawn_tracks List of ints denoting which tracks are to be drawn + - current_track_level Int denoting which track is currently being + drawn + - fragment_height Float total fragment height in pixels + - fragment_bases Int total fragment length in bases + - fragment_lines Dictionary of top and bottom y-coords of fragment, + keyed by fragment number + - fragment_limits Dictionary of start and end bases of each fragment, + keyed by fragment number + - track_offsets Dictionary of number of pixels that each track top, + center and bottom is offset from the base of a fragment, keyed by track + - cross_track_links List of tuples each with four entries (track A, + feature A, track B, feature B) to be linked. + + """ + + def __init__( + self, + parent=None, + pagesize="A3", + orientation="landscape", + x=0.05, + y=0.05, + xl=None, + xr=None, + yt=None, + yb=None, + start=None, + end=None, + tracklines=0, + fragments=10, + fragment_size=None, + track_size=0.75, + cross_track_links=None, + ): + """Initialize. 
+ + Arguments: + - parent Diagram object containing the data that the drawer draws + - pagesize String describing the ISO size of the image, or a tuple + of pixels + - orientation String describing the required orientation of the + final drawing ('landscape' or 'portrait') + - x Float (0->1) describing the relative size of the X + margins to the page + - y Float (0->1) describing the relative size of the Y + margins to the page + - xl Float (0->1) describing the relative size of the left X + margin to the page (overrides x) + - xl Float (0->1) describing the relative size of the left X + margin to the page (overrides x) + - xr Float (0->1) describing the relative size of the right X + margin to the page (overrides x) + - yt Float (0->1) describing the relative size of the top Y + margin to the page (overrides y) + - yb Float (0->1) describing the relative size of the lower Y + margin to the page (overrides y) + - start Int, the position to begin drawing the diagram at + - end Int, the position to stop drawing the diagram at + - tracklines Boolean flag to show (or not) lines delineating tracks + on the diagram + - fragments Int, the number of equal fragments into which the + sequence should be divided for drawing + - fragment_size Float(0->1) The proportion of the available height + for the fragment that should be taken up in drawing + - track_size The proportion of the available track height that + should be taken up in drawing + - cross_track_links List of tuples each with four entries (track A, + feature A, track B, feature B) to be linked. + """ + # Use the superclass' instantiation method + AbstractDrawer.__init__( + self, + parent, + pagesize, + orientation, + x, + y, + xl, + xr, + yt, + yb, + start, + end, + tracklines, + cross_track_links, + ) + + # Useful measurements on the page + self.fragments = fragments + if fragment_size is not None: + self.fragment_size = fragment_size + else: + if self.fragments == 1: + # For single fragments, default to full height + self.fragment_size = 1 + else: + # Otherwise keep a 10% gap between fragments + self.fragment_size = 0.9 + self.track_size = track_size + + def draw(self): + """Draw a linear diagram of the data in the parent Diagram object.""" + # Instantiate the drawing canvas + self.drawing = Drawing(self.pagesize[0], self.pagesize[1]) + + feature_elements = [] # holds feature elements + feature_labels = [] # holds feature labels + greytrack_bgs = [] # holds track background + greytrack_labels = [] # holds track foreground labels + scale_axes = [] # holds scale axes + scale_labels = [] # holds scale axis labels + + # Get the tracks to be drawn + self.drawn_tracks = self._parent.get_drawn_levels() + + # Set fragment and track sizes + self.init_fragments() + self.set_track_heights() + + # Go through each track in the parent (if it is to be drawn) one by + # one and collate the data as drawing elements + for track_level in self.drawn_tracks: # only use tracks to be drawn + self.current_track_level = track_level # establish track level + track = self._parent[track_level] # get the track at that level + gbgs, glabels = self.draw_greytrack(track) # get greytrack elements + greytrack_bgs.append(gbgs) + greytrack_labels.append(glabels) + features, flabels = self.draw_track(track) # get feature and graph elements + feature_elements.append(features) + feature_labels.append(flabels) + if track.scale: + axes, slabels = self.draw_scale(track) # get scale elements + scale_axes.append(axes) + scale_labels.append(slabels) + + feature_cross_links = [] + for 
cross_link_obj in self.cross_track_links: + cross_link_elements = self.draw_cross_link(cross_link_obj) + if cross_link_elements: + feature_cross_links.append(cross_link_elements) + + # Groups listed in order of addition to page (from back to front) + # Draw track backgrounds + # Draw feature cross track links + # Draw features and graphs + # Draw scale axes + # Draw scale labels + # Draw feature labels + # Draw track labels + element_groups = [ + greytrack_bgs, + feature_cross_links, + feature_elements, + scale_axes, + scale_labels, + feature_labels, + greytrack_labels, + ] + for element_group in element_groups: + for element_list in element_group: + [self.drawing.add(element) for element in element_list] + + if self.tracklines: # Draw test tracks over top of diagram + self.draw_test_tracks() + + def init_fragments(self): + """Initialize useful values for positioning diagram elements.""" + # Set basic heights, lengths etc + self.fragment_height = ( + 1.0 * self.pageheight / self.fragments + ) # total fragment height in pixels + self.fragment_bases = ceil( + 1.0 * self.length / self.fragments + ) # fragment length in bases + + # Key fragment base and top lines by fragment number + # Holds bottom and top line locations of fragments, keyed by fragment number + self.fragment_lines = {} + # Number of pixels to crop the fragment: + fragment_crop = (1 - self.fragment_size) / 2 + fragy = self.ylim # Holder for current absolute fragment base + for fragment in range(self.fragments): + fragtop = fragy - fragment_crop * self.fragment_height # top - crop + fragbtm = ( + fragy - (1 - fragment_crop) * self.fragment_height + ) # bottom + crop + self.fragment_lines[fragment] = (fragbtm, fragtop) + fragy -= self.fragment_height # next fragment base + + # Key base starts and ends for each fragment by fragment number + self.fragment_limits = {} # Holds first and last base positions in a fragment + fragment_step = self.fragment_bases # bases per fragment + fragment_count = 0 + # Add start and end positions for each fragment to dictionary + for marker in range(int(self.start), int(self.end), int(fragment_step)): + self.fragment_limits[fragment_count] = (marker, marker + fragment_step) + fragment_count += 1 + + def set_track_heights(self): + """Set track heights. + + Since tracks may not be of identical heights, the bottom and top + offsets of each track relative to the fragment top and bottom is + stored in a dictionary - self.track_offsets, keyed by track number. + """ + bot_track = min(min(self.drawn_tracks), 1) + top_track = max(self.drawn_tracks) # The 'highest' track number to draw + + trackunit_sum = 0 # Total number of 'units' for the tracks + trackunits = {} # The start and end units for each track, keyed by track number + heightholder = 0 # placeholder variable + for track in range(bot_track, top_track + 1): # for all track numbers to 'draw' + try: + trackheight = self._parent[track].height # Get track height + except Exception: # TODO: IndexError? 
+ trackheight = 1 # ...or default to 1 + trackunit_sum += trackheight # increment total track unit height + trackunits[track] = (heightholder, heightholder + trackheight) + heightholder += trackheight # move to next height + trackunit_height = ( + 1.0 * self.fragment_height * self.fragment_size / trackunit_sum + ) + + # Calculate top and bottom offsets for each track, relative to fragment + # base + track_offsets = {} # The offsets from fragment base for each track + track_crop = ( + trackunit_height * (1 - self.track_size) / 2.0 + ) # 'step back' in pixels + assert track_crop >= 0 + for track in trackunits: + top = trackunits[track][1] * trackunit_height - track_crop # top offset + btm = trackunits[track][0] * trackunit_height + track_crop # bottom offset + ctr = btm + (top - btm) / 2.0 # center offset + track_offsets[track] = (btm, ctr, top) + self.track_offsets = track_offsets + + def draw_test_tracks(self): + """Draw test tracks. + + Draw red lines indicating the top and bottom of each fragment, + and blue ones indicating tracks to be drawn. + """ + # Add lines for each fragment + for fbtm, ftop in self.fragment_lines.values(): + self.drawing.add( + Line(self.x0, ftop, self.xlim, ftop, strokeColor=colors.red) + ) # top line + self.drawing.add( + Line(self.x0, fbtm, self.xlim, fbtm, strokeColor=colors.red) + ) # bottom line + + # Add track lines for this fragment - but only for drawn tracks + for track in self.drawn_tracks: + trackbtm = fbtm + self.track_offsets[track][0] + trackctr = fbtm + self.track_offsets[track][1] + tracktop = fbtm + self.track_offsets[track][2] + self.drawing.add( + Line( + self.x0, tracktop, self.xlim, tracktop, strokeColor=colors.blue + ) + ) # top line + self.drawing.add( + Line( + self.x0, trackctr, self.xlim, trackctr, strokeColor=colors.green + ) + ) # center line + self.drawing.add( + Line( + self.x0, trackbtm, self.xlim, trackbtm, strokeColor=colors.blue + ) + ) # bottom line + + def draw_track(self, track): + """Draw track. + + Arguments: + - track Track object + + Returns a tuple (list of elements in the track, list of labels in + the track). + """ + track_elements = [] # Holds elements from features and graphs + track_labels = [] # Holds labels from features and graphs + + # Distribution dictionary for dealing with different set types + set_methods = {FeatureSet: self.draw_feature_set, GraphSet: self.draw_graph_set} + + for set in track.get_sets(): # Draw the feature or graph sets + elements, labels = set_methods[set.__class__](set) + track_elements += elements + track_labels += labels + return track_elements, track_labels + + def draw_tick(self, tickpos, ctr, ticklen, track, draw_label): + """Draw tick. + + Arguments: + - tickpos Int, position of the tick on the sequence + - ctr Float, Y co-ord of the center of the track + - ticklen How long to draw the tick + - track Track, the track the tick is drawn on + - draw_label Boolean, write the tick label? 
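+
+        For example, with track.scale_format == "SInt" a tick at
+        1,500,000 bp is labelled "1 Mbp" (positions are integer-divided
+        by the unit, so labels round down).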
+ + Returns a drawing element that is the tick on the scale + """ + if self.start >= tickpos and tickpos >= self.end: + raise RuntimeError( + "Tick at %i, but showing %i to %i" % (tickpos, self.start, self.end) + ) + if not ( + (track.start is None or track.start <= tickpos) + and (track.end is None or tickpos <= track.end) + ): + raise RuntimeError( + "Tick at %i, but showing %r to %r for track" + % (tickpos, track.start, track.end) + ) + fragment, tickx = self.canvas_location(tickpos) # Tick co-ordinates + assert fragment >= 0, "Fragment %i, tickpos %i" % (fragment, tickpos) + tctr = ctr + self.fragment_lines[fragment][0] # Center line of the track + tickx += self.x0 # Tick X co-ord + ticktop = tctr + ticklen # Y co-ord of tick top + tick = Line(tickx, tctr, tickx, ticktop, strokeColor=track.scale_color) + if draw_label: # Put tick position on as label + if track.scale_format == "SInt": + if tickpos >= 1000000: + tickstring = str(tickpos // 1000000) + " Mbp" + elif tickpos >= 1000: + tickstring = str(tickpos // 1000) + " Kbp" + else: + tickstring = str(tickpos) + else: + tickstring = str(tickpos) + label = String( + 0, + 0, + tickstring, # Make label string + fontName=track.scale_font, + fontSize=track.scale_fontsize, + fillColor=track.scale_color, + ) + labelgroup = Group(label) + rotation = angle2trig(track.scale_fontangle) + labelgroup.transform = ( + rotation[0], + rotation[1], + rotation[2], + rotation[3], + tickx, + ticktop, + ) + else: + labelgroup = None + return tick, labelgroup + + def draw_scale(self, track): + """Draw scale. + + Argument: + - track Track object + + Returns a tuple of (list of elements in the scale, list of labels + in the scale). + """ + scale_elements = [] # Holds axes and ticks + scale_labels = [] # Holds labels + + if not track.scale: # No scale required, exit early + return [], [] + + # Get track location + btm, ctr, top = self.track_offsets[self.current_track_level] + trackheight = top - ctr + + # For each fragment, draw the scale for this track + start, end = self._current_track_start_end() + start_f, start_x = self.canvas_location(start) + end_f, end_x = self.canvas_location(end) + + for fragment in range(start_f, end_f + 1): + tbtm = btm + self.fragment_lines[fragment][0] + tctr = ctr + self.fragment_lines[fragment][0] + ttop = top + self.fragment_lines[fragment][0] + # X-axis + if fragment == start_f: + x_left = start_x + else: + x_left = 0 + if fragment == end_f: + x_right = end_x + # Y-axis end marker + scale_elements.append( + Line( + self.x0 + x_right, + tbtm, + self.x0 + x_right, + ttop, + strokeColor=track.scale_color, + ) + ) + else: + x_right = self.xlim - self.x0 + scale_elements.append( + Line( + self.x0 + x_left, + tctr, + self.x0 + x_right, + tctr, + strokeColor=track.scale_color, + ) + ) + # Y-axis start marker + scale_elements.append( + Line( + self.x0 + x_left, + tbtm, + self.x0 + x_left, + ttop, + strokeColor=track.scale_color, + ) + ) + + start, end = self._current_track_start_end() + if track.scale_ticks: # Ticks are required on the scale + # Draw large ticks + # I want the ticks to be consistently positioned relative to + # the start of the sequence (position 0), not relative to the + # current viewpoint (self.start and self.end) + + ticklen = track.scale_largeticks * trackheight + tickiterval = int(track.scale_largetick_interval) + # Note that we could just start the list of ticks using + # range(0,self.end,tickinterval) and the filter out the + # ones before self.start - but this seems wasteful. 
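+            # Instead, start the range at the last multiple of the tick
+            # interval at or before self.start: e.g. with self.start = 1234
+            # and an interval of 1000, the loop below starts at 1000, and
+            # the bounds check inside it drops that tick, leaving 2000,
+            # 3000, ...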
+ # Using tickiterval * (self.start//tickiterval) is a shortcut. + for tickpos in range( + tickiterval * (self.start // tickiterval), int(self.end), tickiterval + ): + if tickpos <= start or end <= tickpos: + continue + tick, label = self.draw_tick( + tickpos, ctr, ticklen, track, track.scale_largetick_labels + ) + scale_elements.append(tick) + if label is not None: # If there's a label, add it + scale_labels.append(label) + # Draw small ticks + ticklen = track.scale_smallticks * trackheight + tickiterval = int(track.scale_smalltick_interval) + for tickpos in range( + tickiterval * (self.start // tickiterval), int(self.end), tickiterval + ): + if tickpos <= start or end <= tickpos: + continue + tick, label = self.draw_tick( + tickpos, ctr, ticklen, track, track.scale_smalltick_labels + ) + scale_elements.append(tick) + if label is not None: # If there's a label, add it + scale_labels.append(label) + + # Check to see if the track contains a graph - if it does, get the + # minimum and maximum values, and put them on the scale Y-axis + if track.axis_labels: + for set in track.get_sets(): # Check all sets... + if set.__class__ is GraphSet: # ...for a graph set + graph_label_min = [] + graph_label_mid = [] + graph_label_max = [] + for graph in set.get_graphs(): + quartiles = graph.quartiles() + minval, maxval = quartiles[0], quartiles[4] + if graph.center is None: + midval = (maxval + minval) / 2.0 + graph_label_min.append("%.3f" % minval) + graph_label_max.append("%.3f" % maxval) + else: + diff = max((graph.center - minval), (maxval - graph.center)) + minval = graph.center - diff + maxval = graph.center + diff + midval = graph.center + graph_label_mid.append("%.3f" % midval) + graph_label_min.append("%.3f" % minval) + graph_label_max.append("%.3f" % maxval) + for fragment in range( + start_f, end_f + 1 + ): # Add to all used fragment axes + tbtm = btm + self.fragment_lines[fragment][0] + tctr = ctr + self.fragment_lines[fragment][0] + ttop = top + self.fragment_lines[fragment][0] + if fragment == start_f: + x_left = start_x + else: + x_left = 0 + for val, pos in [ + (";".join(graph_label_min), tbtm), + (";".join(graph_label_max), ttop), + (";".join(graph_label_mid), tctr), + ]: + label = String( + 0, + 0, + val, + fontName=track.scale_font, + fontSize=track.scale_fontsize, + fillColor=track.scale_color, + ) + labelgroup = Group(label) + rotation = angle2trig(track.scale_fontangle) + labelgroup.transform = ( + rotation[0], + rotation[1], + rotation[2], + rotation[3], + self.x0 + x_left, + pos, + ) + scale_labels.append(labelgroup) + + return scale_elements, scale_labels + + def draw_greytrack(self, track): + """Draw greytrack. + + Arguments: + - track Track object + + Put in a grey background to the current track in all fragments, + if track specifies that we should. 
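+
+        If track.greytrack_labels is set to N, the track name is repeated
+        roughly N times across each fragment, spaced pagewidth / N apart.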
+ """ + greytrack_bgs = [] # Holds grey track backgrounds + greytrack_labels = [] # Holds grey foreground labels + + if not track.greytrack: # No greytrack required, return early + return [], [] + + # Get track location + btm, ctr, top = self.track_offsets[self.current_track_level] + + start, end = self._current_track_start_end() + start_fragment, start_offset = self.canvas_location(start) + end_fragment, end_offset = self.canvas_location(end) + + # Add greytrack to all fragments for this track + for fragment in range(start_fragment, end_fragment + 1): + tbtm = btm + self.fragment_lines[fragment][0] + tctr = ctr + self.fragment_lines[fragment][0] + ttop = top + self.fragment_lines[fragment][0] + if fragment == start_fragment: + x1 = self.x0 + start_offset + else: + x1 = self.x0 + if fragment == end_fragment: + x2 = self.x0 + end_offset + else: + x2 = self.xlim + box = draw_box( + (x1, tbtm), (x2, ttop), colors.Color(0.96, 0.96, 0.96) # Grey track bg + ) # is just a box + greytrack_bgs.append(box) + + if track.greytrack_labels: # If labels are required + # # how far apart should they be? + labelstep = self.pagewidth / track.greytrack_labels + label = String( + 0, + 0, + track.name, # label contents + fontName=track.greytrack_font, + fontSize=track.greytrack_fontsize, + fillColor=track.greytrack_fontcolor, + ) + # Create a new labelgroup at each position the label is required + for x in range(int(self.x0), int(self.xlim), int(labelstep)): + if fragment == start_fragment and x < start_offset: + continue + if ( + fragment == end_fragment + and end_offset < x + label.getBounds()[2] + ): + continue + labelgroup = Group(label) + rotation = angle2trig(track.greytrack_font_rotation) + labelgroup.transform = ( + rotation[0], + rotation[1], + rotation[2], + rotation[3], + x, + tbtm, + ) + if not self.xlim - x <= labelstep: + # Don't overlap the end of the track + greytrack_labels.append(labelgroup) + + return greytrack_bgs, greytrack_labels + + def draw_feature_set(self, set): + """Draw feature set. + + Arguments: + - set FeatureSet object + + Returns a tuple (list of elements describing features, list of + labels for elements). + """ + # print("draw feature set") + feature_elements = [] # Holds diagram elements belonging to the features + label_elements = [] # Holds diagram elements belonging to feature labels + + # Collect all the elements for the feature set + for feature in set.get_features(): + if self.is_in_bounds(feature.start) or self.is_in_bounds(feature.end): + features, labels = self.draw_feature(feature) # get elements and labels + feature_elements += features + label_elements += labels + + return feature_elements, label_elements + + def draw_feature(self, feature): + """Draw feature. + + Arguments: + - feature Feature containing location info + + Returns tuple of (list of elements describing single feature, list + of labels for those elements). + """ + if feature.hide: # Feature hidden, don't draw it... 
+ return [], [] + + feature_elements = [] # Holds diagram elements belonging to the feature + label_elements = [] # Holds labels belonging to the feature + + start, end = self._current_track_start_end() + # A single feature may be split into subfeatures, so loop over them + for locstart, locend in feature.locations: + if locend < start: + continue + locstart = max(locstart, start) + if end < locstart: + continue + locend = min(locend, end) + feature_boxes = self.draw_feature_location(feature, locstart, locend) + for box, label in feature_boxes: + feature_elements.append(box) + if label is not None: + label_elements.append(label) + + return feature_elements, label_elements + + def draw_feature_location(self, feature, locstart, locend): + """Draw feature location.""" + feature_boxes = [] + # Get start and end positions for feature/subfeatures + start_fragment, start_offset = self.canvas_location(locstart) + end_fragment, end_offset = self.canvas_location(locend) + # print("start_fragment, start_offset", start_fragment, start_offset) + # print("end_fragment, end_offset", end_fragment, end_offset) + # print("start, end", locstart, locend) + + # Note that there is a strange situation where a feature may be in + # several parts, and one or more of those parts may end up being + # drawn on a non-existent fragment. So we check that the start and + # end fragments do actually exist in terms of the drawing + allowed_fragments = list(self.fragment_limits.keys()) + if start_fragment in allowed_fragments and end_fragment in allowed_fragments: + # print(feature.name, feature.start, feature.end, start_offset, end_offset) + if start_fragment == end_fragment: # Feature is found on one fragment + feature_box, label = self.get_feature_sigil( + feature, start_offset, end_offset, start_fragment + ) + feature_boxes.append((feature_box, label)) + # feature_elements.append(feature_box) + # if label is not None: # There is a label for the feature + # label_elements.append(label) + else: # Feature is split over two or more fragments + fragment = start_fragment + start = start_offset + # The bit that runs up to the end of the first fragment, + # and any bits that subsequently span whole fragments + while self.fragment_limits[fragment][1] < locend: + # print(fragment, self.fragment_limits[fragment][1], locend) + feature_box, label = self.get_feature_sigil( + feature, start, self.pagewidth, fragment + ) + + fragment += 1 # move to next fragment + start = 0 # start next sigil from start of fragment + feature_boxes.append((feature_box, label)) + # feature_elements.append(feature_box) + # if label is not None: # There's a label for the feature + # label_elements.append(label) + # The last bit of the feature + # print(locend, self.end, fragment) + # print(self.fragment_bases, self.length) + feature_box, label = self.get_feature_sigil( + feature, 0, end_offset, fragment + ) + feature_boxes.append((feature_box, label)) + # if locstart > locend: + # print(locstart, locend, feature.strand, feature_boxes, feature.name) + return feature_boxes + + def draw_cross_link(self, cross_link): + """Draw cross-link between two features.""" + startA = cross_link.startA + startB = cross_link.startB + endA = cross_link.endA + endB = cross_link.endB + + if not self.is_in_bounds(startA) and not self.is_in_bounds(endA): + return None + if not self.is_in_bounds(startB) and not self.is_in_bounds(endB): + return None + + if startA < self.start: + startA = self.start + if startB < self.start: + startB = self.start + if self.end < endA: + endA = 
self.end + if self.end < endB: + endB = self.end + + trackobjA = cross_link._trackA(list(self._parent.tracks.values())) + trackobjB = cross_link._trackB(list(self._parent.tracks.values())) + assert trackobjA is not None + assert trackobjB is not None + if trackobjA == trackobjB: + raise NotImplementedError() + + if trackobjA.start is not None: + if endA < trackobjA.start: + return + startA = max(startA, trackobjA.start) + if trackobjA.end is not None: + if trackobjA.end < startA: + return + endA = min(endA, trackobjA.end) + if trackobjB.start is not None: + if endB < trackobjB.start: + return + startB = max(startB, trackobjB.start) + if trackobjB.end is not None: + if trackobjB.end < startB: + return + endB = min(endB, trackobjB.end) + + for track_level in self._parent.get_drawn_levels(): + track = self._parent[track_level] + if track == trackobjA: + trackA = track_level + if track == trackobjB: + trackB = track_level + if trackA == trackB: + raise NotImplementedError() + + strokecolor, fillcolor = _stroke_and_fill_colors( + cross_link.color, cross_link.border + ) + + allowed_fragments = list(self.fragment_limits.keys()) + + start_fragmentA, start_offsetA = self.canvas_location(startA) + end_fragmentA, end_offsetA = self.canvas_location(endA) + if ( + start_fragmentA not in allowed_fragments + or end_fragmentA not in allowed_fragments + ): + return + + start_fragmentB, start_offsetB = self.canvas_location(startB) + end_fragmentB, end_offsetB = self.canvas_location(endB) + if ( + start_fragmentB not in allowed_fragments + or end_fragmentB not in allowed_fragments + ): + return + + # TODO - Better drawing of flips when split between fragments + + answer = [] + for fragment in range( + min(start_fragmentA, start_fragmentB), max(end_fragmentA, end_fragmentB) + 1 + ): + btmA, ctrA, topA = self.track_offsets[trackA] + btmA += self.fragment_lines[fragment][0] + ctrA += self.fragment_lines[fragment][0] + topA += self.fragment_lines[fragment][0] + + btmB, ctrB, topB = self.track_offsets[trackB] + btmB += self.fragment_lines[fragment][0] + ctrB += self.fragment_lines[fragment][0] + topB += self.fragment_lines[fragment][0] + + if self.fragment_limits[fragment][1] < endA: + xAe = self.x0 + self.pagewidth + crop_rightA = True + else: + xAe = self.x0 + end_offsetA + crop_rightA = False + if self.fragment_limits[fragment][1] < endB: + xBe = self.x0 + self.pagewidth + crop_rightB = True + else: + xBe = self.x0 + end_offsetB + crop_rightB = False + + if fragment < start_fragmentA: + xAs = self.x0 + self.pagewidth + xAe = xAs + crop_leftA = False + elif fragment == start_fragmentA: + xAs = self.x0 + start_offsetA + crop_leftA = False + else: + xAs = self.x0 + crop_leftA = True + + if fragment < start_fragmentB: + xBs = self.x0 + self.pagewidth + xBe = xBs + crop_leftB = False + elif fragment == start_fragmentB: + xBs = self.x0 + start_offsetB + crop_leftB = False + else: + xBs = self.x0 + crop_leftB = True + + if ctrA < ctrB: + yA = topA + yB = btmB + else: + yA = btmA + yB = topB + + if fragment < start_fragmentB or end_fragmentB < fragment: + if cross_link.flip: + # Just draw A as a triangle to left/right + if fragment < start_fragmentB: + extra = [self.x0 + self.pagewidth, 0.5 * (yA + yB)] + else: + extra = [self.x0, 0.5 * (yA + yB)] + else: + if fragment < start_fragmentB: + extra = [ + self.x0 + self.pagewidth, + 0.7 * yA + 0.3 * yB, + self.x0 + self.pagewidth, + 0.3 * yA + 0.7 * yB, + ] + else: + extra = [ + self.x0, + 0.3 * yA + 0.7 * yB, + self.x0, + 0.7 * yA + 0.3 * yB, + ] + answer.append( + 
Polygon( + deduplicate([xAs, yA, xAe, yA] + extra), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + elif fragment < start_fragmentA or end_fragmentA < fragment: + if cross_link.flip: + # Just draw B as a triangle to left + if fragment < start_fragmentA: + extra = [self.x0 + self.pagewidth, 0.5 * (yA + yB)] + else: + extra = [self.x0, 0.5 * (yA + yB)] + else: + if fragment < start_fragmentA: + extra = [ + self.x0 + self.pagewidth, + 0.3 * yA + 0.7 * yB, + self.x0 + self.pagewidth, + 0.7 * yA + 0.3 * yB, + ] + else: + extra = [ + self.x0, + 0.7 * yA + 0.3 * yB, + self.x0, + 0.3 * yA + 0.7 * yB, + ] + answer.append( + Polygon( + deduplicate([xBs, yB, xBe, yB] + extra), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + elif cross_link.flip and ( + (crop_leftA and not crop_rightA) or (crop_leftB and not crop_rightB) + ): + # On left end of fragment... force "crossing" to margin + answer.append( + Polygon( + deduplicate( + [ + xAs, + yA, + xAe, + yA, + self.x0, + 0.5 * (yA + yB), + xBe, + yB, + xBs, + yB, + ] + ), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + elif cross_link.flip and ( + (crop_rightA and not crop_leftA) or (crop_rightB and not crop_leftB) + ): + # On right end... force "crossing" to margin + answer.append( + Polygon( + deduplicate( + [ + xAs, + yA, + xAe, + yA, + xBe, + yB, + xBs, + yB, + self.x0 + self.pagewidth, + 0.5 * (yA + yB), + ] + ), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + elif cross_link.flip: + answer.append( + Polygon( + deduplicate([xAs, yA, xAe, yA, xBs, yB, xBe, yB]), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + else: + answer.append( + Polygon( + deduplicate([xAs, yA, xAe, yA, xBe, yB, xBs, yB]), + strokeColor=strokecolor, + fillColor=fillcolor, + # default is mitre/miter which can stick out too much: + strokeLineJoin=1, # 1=round + strokewidth=0, + ) + ) + return answer + + def get_feature_sigil(self, feature, x0, x1, fragment, **kwargs): + """Get feature sigil. + + Arguments: + - feature Feature object + - x0 Start X co-ordinate on diagram + - x1 End X co-ordinate on diagram + - fragment The fragment on which the feature appears + + Returns a drawable indicator of the feature, and any required label + for it. 
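+
+        For example (sketch), to make the sigil clickable in SVG output,
+        give the feature a url attribute before drawing (the address below
+        is hypothetical):
+
+            feature.url = "https://example.org/gene/xyz"
+            feature.sigil = "BIGARROW"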
+ """ + # Establish co-ordinates for drawing + x0, x1 = self.x0 + x0, self.x0 + x1 + btm, ctr, top = self.track_offsets[self.current_track_level] + try: + btm += self.fragment_lines[fragment][0] + ctr += self.fragment_lines[fragment][0] + top += self.fragment_lines[fragment][0] + except Exception: # Only called if the method screws up big time + print("We've got a screw-up") + print("%s %s" % (self.start, self.end)) + print(self.fragment_bases) + print("%r %r" % (x0, x1)) + for locstart, locend in feature.locations: + print(self.canvas_location(locstart)) + print(self.canvas_location(locend)) + print("FEATURE\n%s" % feature) + raise + + # Distribution dictionary for various ways of drawing the feature + draw_methods = { + "BOX": self._draw_sigil_box, + "ARROW": self._draw_sigil_arrow, + "BIGARROW": self._draw_sigil_big_arrow, + "OCTO": self._draw_sigil_octo, + "JAGGY": self._draw_sigil_jaggy, + } + + method = draw_methods[feature.sigil] + kwargs["head_length_ratio"] = feature.arrowhead_length + kwargs["shaft_height_ratio"] = feature.arrowshaft_height + + # Support for clickable links... needs ReportLab 2.4 or later + # which added support for links in SVG output. + if hasattr(feature, "url"): + kwargs["hrefURL"] = feature.url + kwargs["hrefTitle"] = feature.name + + # Get sigil for the feature, give it the bounding box straddling + # the axis (it decides strand specific placement) + sigil = method( + btm, + ctr, + top, + x0, + x1, + strand=feature.strand, + color=feature.color, + border=feature.border, + **kwargs + ) + + if feature.label_strand: + strand = feature.label_strand + else: + strand = feature.strand + if feature.label: # Feature requires a label + label = String( + 0, + 0, + feature.name, + fontName=feature.label_font, + fontSize=feature.label_size, + fillColor=feature.label_color, + ) + labelgroup = Group(label) + # Feature is on top, or covers both strands (location affects + # the height and rotation of the label) + if strand != -1: + rotation = angle2trig(feature.label_angle) + if feature.label_position in ("end", "3'", "right"): + pos = x1 + elif feature.label_position in ("middle", "center", "centre"): + pos = (x1 + x0) / 2.0 + else: + # Default to start, i.e. 'start', "5'", 'left' + pos = x0 + labelgroup.transform = ( + rotation[0], + rotation[1], + rotation[2], + rotation[3], + pos, + top, + ) + else: # Feature on bottom strand + rotation = angle2trig(feature.label_angle + 180) + if feature.label_position in ("end", "3'", "right"): + pos = x0 + elif feature.label_position in ("middle", "center", "centre"): + pos = (x1 + x0) / 2.0 + else: + # Default to start, i.e. 'start', "5'", 'left' + pos = x1 + labelgroup.transform = ( + rotation[0], + rotation[1], + rotation[2], + rotation[3], + pos, + btm, + ) + else: + labelgroup = None + return sigil, labelgroup + + def draw_graph_set(self, set): + """Draw graph set. + + Arguments: + - set GraphSet object + + Returns tuple (list of graph elements, list of graph labels). + """ + # print('draw graph set') + elements = [] # Holds graph elements + + # Distribution dictionary for how to draw the graph + style_methods = { + "line": self.draw_line_graph, + "heat": self.draw_heat_graph, + "bar": self.draw_bar_graph, + } + + for graph in set.get_graphs(): + elements += style_methods[graph.style](graph) + + return elements, [] + + def draw_line_graph(self, graph): + """Return a line graph as a list of drawable elements. 
+ + Arguments: + - graph Graph object + + """ + # print('\tdraw_line_graph') + line_elements = [] # Holds drawable elements + + # Get graph data + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + btm, ctr, top = self.track_offsets[self.current_track_level] + trackheight = 0.5 * (top - btm) + datarange = maxval - minval + if datarange == 0: + datarange = trackheight + + start, end = self._current_track_start_end() + data = graph[start:end] + + # midval is the value at which the x-axis is plotted, and is the + # central ring in the track + if graph.center is None: + midval = (maxval + minval) / 2.0 + else: + midval = graph.center + # Whichever is the greatest difference: max-midval or min-midval, is + # taken to specify the number of pixel units resolved along the + # y-axis + resolution = max((midval - minval), (maxval - midval)) + + # Start from first data point + pos, val = data[0] + lastfrag, lastx = self.canvas_location(pos) + lastx += self.x0 # Start xy co-ords + lasty = ( + trackheight * (val - midval) / resolution + + self.fragment_lines[lastfrag][0] + + ctr + ) + lastval = val + # Add a series of lines linking consecutive data points + for pos, val in data: + frag, x = self.canvas_location(pos) + x += self.x0 # next xy co-ords + y = ( + trackheight * (val - midval) / resolution + + self.fragment_lines[frag][0] + + ctr + ) + if frag == lastfrag: # Points on the same fragment: draw the line + line_elements.append( + Line( + lastx, + lasty, + x, + y, + strokeColor=graph.poscolor, + strokeWidth=graph.linewidth, + ) + ) + else: # Points not on the same fragment, so interpolate + tempy = ( + trackheight * (val - midval) / resolution + + self.fragment_lines[lastfrag][0] + + ctr + ) + line_elements.append( + Line( + lastx, + lasty, + self.xlim, + tempy, + strokeColor=graph.poscolor, + strokeWidth=graph.linewidth, + ) + ) + tempy = ( + trackheight * (val - midval) / resolution + + self.fragment_lines[frag][0] + + ctr + ) + line_elements.append( + Line( + self.x0, + tempy, + x, + y, + strokeColor=graph.poscolor, + strokeWidth=graph.linewidth, + ) + ) + lastfrag, lastx, lasty, lastval = frag, x, y, val + + return line_elements + + def draw_heat_graph(self, graph): + """Return a list of drawable elements for the heat graph.""" + # print('\tdraw_heat_graph') + # At each point contained in the graph data, we draw a box that is the + # full height of the track, extending from the midpoint between the + # previous and current data points to the midpoint between the current + # and next data points + heat_elements = [] # Holds drawable elements for the graph + + # Get graph data and information + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + midval = (maxval + minval) / 2.0 # mid is the value at the X-axis + btm, ctr, top = self.track_offsets[self.current_track_level] + trackheight = top - btm + + start, end = self._current_track_start_end() + data = intermediate_points(start, end, graph[start:end]) + + if not data: + return [] + + # Create elements on the graph, indicating a large positive value by + # the graph's poscolor, and a large negative value by the graph's + # negcolor attributes + for pos0, pos1, val in data: + # assert start <= pos0 <= pos1 <= end + fragment0, x0 = self.canvas_location(pos0) + fragment1, x1 = self.canvas_location(pos1) + x0, x1 = self.x0 + x0, self.x0 + x1 # account for margin + # print('x1 before:', x1) + + # Calculate the heat color, based on the differential between + # the 
value and the median value + heat = colors.linearlyInterpolatedColor( + graph.poscolor, graph.negcolor, maxval, minval, val + ) + + # Draw heat box + if fragment0 == fragment1: # Box is contiguous on one fragment + if pos1 >= self.fragment_limits[fragment0][1]: + x1 = self.xlim + ttop = top + self.fragment_lines[fragment0][0] + tbtm = btm + self.fragment_lines[fragment0][0] + # print('equal', pos0, pos1, val) + # print(pos0, pos1, fragment0, fragment1) + heat_elements.append( + draw_box((x0, tbtm), (x1, ttop), color=heat, border=None) + ) + else: # box is split over two or more fragments + # if pos0 >= self.fragment_limits[fragment0][0]: + # fragment0 += 1 + fragment = fragment0 + start_x = x0 + while self.fragment_limits[fragment][1] <= pos1: + # print(pos0, self.fragment_limits[fragment][1], pos1) + ttop = top + self.fragment_lines[fragment][0] + tbtm = btm + self.fragment_lines[fragment][0] + heat_elements.append( + draw_box( + (start_x, tbtm), (self.xlim, ttop), color=heat, border=None + ) + ) + fragment += 1 + start_x = self.x0 + ttop = top + self.fragment_lines[fragment][0] + tbtm = btm + self.fragment_lines[fragment][0] + # Add the last part of the bar + # print('x1 after:', x1, '\n') + heat_elements.append( + draw_box((self.x0, tbtm), (x1, ttop), color=heat, border=None) + ) + + return heat_elements + + def draw_bar_graph(self, graph): + """Return list of drawable elements for a bar graph.""" + # print('\tdraw_bar_graph') + # At each point contained in the graph data, we draw a vertical bar + # from the track center to the height of the datapoint value (positive + # values go up in one color, negative go down in the alternative + # color). + bar_elements = [] # Holds drawable elements for the graph + + # Set the number of pixels per unit for the data + data_quartiles = graph.quartiles() + minval, maxval = data_quartiles[0], data_quartiles[4] + btm, ctr, top = self.track_offsets[self.current_track_level] + trackheight = 0.5 * (top - btm) + datarange = maxval - minval + if datarange == 0: + datarange = trackheight + data = graph[self.start : self.end] + # midval is the value at which the x-axis is plotted, and is the + # central ring in the track + if graph.center is None: + midval = (maxval + minval) / 2.0 + else: + midval = graph.center + + # Convert data into 'binned' blocks, covering half the distance to the + # next data point on either side, accounting for the ends of fragments + # and tracks + start, end = self._current_track_start_end() + data = intermediate_points(start, end, graph[start:end]) + + if not data: + return [] + + # Whichever is the greatest difference: max-midval or min-midval, is + # taken to specify the number of pixel units resolved along the + # y-axis + resolution = max((midval - minval), (maxval - midval)) + if resolution == 0: + resolution = trackheight + + # Create elements for the bar graph based on newdata + for pos0, pos1, val in data: + fragment0, x0 = self.canvas_location(pos0) + fragment1, x1 = self.canvas_location(pos1) + x0, x1 = self.x0 + x0, self.x0 + x1 # account for margin + barval = trackheight * (val - midval) / resolution + if barval >= 0: # Different colors for bars that extend above... 
+ barcolor = graph.poscolor + else: # ...or below the axis + barcolor = graph.negcolor + + # Draw bar + if fragment0 == fragment1: # Box is contiguous + if pos1 >= self.fragment_limits[fragment0][1]: + x1 = self.xlim + tctr = ctr + self.fragment_lines[fragment0][0] + barval += tctr + bar_elements.append(draw_box((x0, tctr), (x1, barval), color=barcolor)) + else: # Box is split over two or more fragments + fragment = fragment0 + # if pos0 >= self.fragment_limits[fragment0][0]: + # fragment += 1 + start = x0 + while self.fragment_limits[fragment][1] < pos1: + tctr = ctr + self.fragment_lines[fragment][0] + thisbarval = barval + tctr + bar_elements.append( + draw_box((start, tctr), (self.xlim, thisbarval), color=barcolor) + ) + fragment += 1 + start = self.x0 + tctr = ctr + self.fragment_lines[fragment1][0] + barval += tctr + # Add the last part of the bar + bar_elements.append( + draw_box((self.x0, tctr), (x1, barval), color=barcolor) + ) + + return bar_elements + + def canvas_location(self, base): + """Canvas location of a base on the genome. + + Arguments: + - base The base number on the genome sequence + + Returns the x-coordinate and fragment number of a base on the + genome sequence, in the context of the current drawing setup + """ + base = int(base - self.start) # number of bases we are from the start + fragment = int(base / self.fragment_bases) + if fragment < 1: # First fragment + base_offset = base + fragment = 0 + elif fragment >= self.fragments: + fragment = self.fragments - 1 + base_offset = self.fragment_bases + else: # Calculate number of bases from start of fragment + base_offset = base % self.fragment_bases + assert fragment < self.fragments, ( + base, + self.start, + self.end, + self.length, + self.fragment_bases, + ) + # Calculate number of pixels from start of fragment + x_offset = 1.0 * self.pagewidth * base_offset / self.fragment_bases + return fragment, x_offset + + def _draw_sigil_box(self, bottom, center, top, x1, x2, strand, **kwargs): + """Draw BOX sigil (PRIVATE).""" + if strand == 1: + y1 = center + y2 = top + elif strand == -1: + y1 = bottom + y2 = center + else: + y1 = bottom + y2 = top + return draw_box((x1, y1), (x2, y2), **kwargs) + + def _draw_sigil_octo(self, bottom, center, top, x1, x2, strand, **kwargs): + """Draw OCTO sigil, a box with the corners cut off (PRIVATE).""" + if strand == 1: + y1 = center + y2 = top + elif strand == -1: + y1 = bottom + y2 = center + else: + y1 = bottom + y2 = top + return draw_cut_corner_box((x1, y1), (x2, y2), **kwargs) + + def _draw_sigil_jaggy( + self, bottom, center, top, x1, x2, strand, color, border=None, **kwargs + ): + """Draw JAGGY sigil (PRIVATE). + + Although we may in future expose the head/tail jaggy lengths, for now + both the left and right edges are drawn jagged. 
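The fragment/offset arithmetic in `canvas_location` above reduces to a few lines of integer division and scaling; here is a minimal standalone sketch, with made-up drawing parameters (`start`, `fragment_bases`, `fragments`, and `pagewidth` are illustrative values, not taken from this capsule):

```python
# Sketch of the canvas_location() arithmetic, outside the drawer class.
start = 0             # first base drawn
fragment_bases = 250  # bases per horizontal fragment (row)
fragments = 4         # number of fragments on the page
pagewidth = 600.0     # drawable width in points

def canvas_location(base):
    base = int(base - start)               # bases from the drawing start
    fragment = int(base / fragment_bases)  # which row the base falls on
    if fragment < 1:                       # clamp into [0, fragments - 1]
        fragment, base_offset = 0, base
    elif fragment >= fragments:
        fragment, base_offset = fragments - 1, fragment_bases
    else:
        base_offset = base % fragment_bases  # bases into this fragment
    x_offset = pagewidth * base_offset / fragment_bases
    return fragment, x_offset

print(canvas_location(625))  # (2, 300.0): third row, halfway across
```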
+ """ + if strand == 1: + y1 = center + y2 = top + teeth = 2 + elif strand == -1: + y1 = bottom + y2 = center + teeth = 2 + else: + y1 = bottom + y2 = top + teeth = 4 + + xmin = min(x1, x2) + xmax = max(x1, x2) + height = y2 - y1 + boxwidth = x2 - x1 + tooth_length = min(height / teeth, boxwidth * 0.5) + + headlength = tooth_length + taillength = tooth_length + + strokecolor, color = _stroke_and_fill_colors(color, border) + + points = [] + for i in range(teeth): + points.extend( + ( + xmin, + y1 + i * height / teeth, + xmin + taillength, + y1 + (i + 1) * height / teeth, + ) + ) + for i in range(teeth): + points.extend( + ( + xmax, + y1 + (teeth - i) * height / teeth, + xmax - headlength, + y1 + (teeth - i - 1) * height / teeth, + ) + ) + + return Polygon( + deduplicate(points), + strokeColor=strokecolor, + strokeWidth=1, + strokeLineJoin=1, # 1=round + fillColor=color, + **kwargs + ) + + def _draw_sigil_arrow(self, bottom, center, top, x1, x2, strand, **kwargs): + """Draw ARROW sigil (PRIVATE).""" + if strand == 1: + y1 = center + y2 = top + orientation = "right" + elif strand == -1: + y1 = bottom + y2 = center + orientation = "left" + else: + y1 = bottom + y2 = top + orientation = "right" # backward compatibility + return draw_arrow((x1, y1), (x2, y2), orientation=orientation, **kwargs) + + def _draw_sigil_big_arrow(self, bottom, center, top, x1, x2, strand, **kwargs): + """Draw BIGARROW sigil, like ARROW but straddles the axis (PRIVATE).""" + if strand == -1: + orientation = "left" + else: + orientation = "right" + return draw_arrow((x1, bottom), (x2, top), orientation=orientation, **kwargs) diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Track.py b/code/lib/Bio/Graphics/GenomeDiagram/_Track.py new file mode 100644 index 0000000..a6c67f9 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/_Track.py @@ -0,0 +1,285 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +################################################################################ + +"""Track module. + +Provides: + - Track - Container for a single track on the diagram, containing + FeatureSet and GraphSet objects + +For drawing capabilities, this module uses reportlab to draw and write +the diagram: http://www.reportlab.com +""" + + +from reportlab.lib import colors + +# GenomeDiagram imports +from ._FeatureSet import FeatureSet +from ._GraphSet import GraphSet + +_grey = colors.Color(0.6, 0.6, 0.6) + + +class Track: + """Track. + + Attributes: + - height Int describing the relative height to other trackscale_fontsizes + in the diagram + - name String describing the track + - hide Boolean, 0 if the track is not to be drawn + - start, end Integers (or None) specifying start/end to draw just + a partial track. 
+ - greytrack Boolean, 1 if a grey background to the track is to be + drawn + - greytrack_labels Int describing how many track-identifying labels + should be placed on the track at regular intervals + - greytrack_font String describing the font to use for the greytrack + labels + - greytrack_fontsize Int describing the font size to display the + labels on the grey track + - greytrack_font_rotation Int describing the angle through which to + rotate the grey track labels (Linear only) + - greytrack_font_color colors.Color describing the color to draw + the grey track labels + - scale Boolean, 1 if a scale is to be drawn on the track + - scale_format String, defaults to None, when scale values are written + as numerals. Setting this to 'SInt' invokes SI + unit-like multiples, such as Mbp, Kbp and so on. + - scale_color colors.Color to draw the elements of the scale + - scale_font String describing the font to use for the scale labels + - scale_fontsize Int describing the size of the scale label font + - scale_fontangle Int describing the angle at which to draw the scale + labels (linear only) + - scale_ticks Boolean, 1 if ticks should be drawn at all on the + scale + - scale_largeticks Float (0->1) describing the height of large + scale ticks relative to the track height. + - scale_smallticks Float (0->1) describing the height of small + scale ticks relative to the track height. + - scale_largetick_interval Int, describing the number of bases that + should separate large ticks + - scale_smalltick_interval Int, describing the number of bases that + should separate small ticks + - scale_largetick_labels Boolean describing whether position labels + should be written over large ticks + - scale_smalltick_labels Boolean describing whether position labels + should be written over small ticks + - axis_labels Boolean describing whether the value labels should + be placed on the Y axes + + """ + + def __init__( + self, + name=None, + height=1, + hide=0, + greytrack=0, + greytrack_labels=5, + greytrack_fontsize=8, + greytrack_font="Helvetica", + greytrack_font_rotation=0, + greytrack_font_color=_grey, + scale=1, + scale_format=None, + scale_color=colors.black, + scale_font="Helvetica", + scale_fontsize=6, + scale_fontangle=45, + scale_largeticks=0.5, + scale_ticks=1, + scale_smallticks=0.3, + scale_largetick_interval=1e6, + scale_smalltick_interval=1e4, + scale_largetick_labels=1, + scale_smalltick_labels=0, + axis_labels=1, + start=None, + end=None, + greytrack_font_colour=None, + scale_colour=None, + ): + """Initialize. + + Arguments: + - height Int describing the relative height to other tracks in the + diagram + - name String describing the track + - hide Boolean, 0 if the track is not to be drawn + - greytrack Boolean, 1 if a grey background to the track is to be + drawn + - greytrack_labels Int describing how many track-identifying labels + should be placed on the track at regular intervals + - greytrack_font String describing the font to use for the greytrack + labels + - greytrack_fontsize Int describing the font size to display the + labels on the grey track + - greytrack_font_rotation Int describing the angle through which to + rotate the grey track labels (Linear only) + - greytrack_font_color colors.Color describing the color to draw + the grey track labels (overridden by backwards compatible argument + with UK spelling, colour).
+ - scale Boolean, 1 if a scale is to be drawn on the track + - scale_color colors.Color to draw the elements of the scale + (overridden by backwards compatible argument with UK + spelling, colour). + - scale_font String describing the font to use for the scale labels + - scale_fontsize Int describing the size of the scale label font + - scale_fontangle Int describing the angle at which to draw the scale + labels (linear only) + - scale_ticks Boolean, 1 if ticks should be drawn at all on the + scale + - scale_largeticks Float (0->1) describing the height of large + scale ticks relative to the track height. + - scale_smallticks Float (0->1) describing the height of small + scale ticks relative to the track height. + - scale_largetick_interval Int, describing the number of bases that + should separate large ticks + - scale_smalltick_interval Int, describing the number of bases that + should separate small ticks + - scale_largetick_labels Boolean describing whether position labels + should be written over large ticks + - scale_smalltick_labels Boolean describing whether position labels + should be written over small ticks + - name String to help identify the track + - height Relative height to draw the track + - axis_labels Boolean describing whether the value labels should + be placed on the Y axes + + """ + # Let the UK spelling (colour) override the USA spelling (color) + if greytrack_font_colour is not None: + greytrack_font_color = greytrack_font_colour + if scale_colour is not None: + scale_color = scale_colour + + self._next_id = 0 # This will count sets as they are added to the track + self._sets = {} # Holds sets, keyed by unique ID + + # Assign attribute values from instantiation + self.height = height + if name is not None: + self.name = str(name) + else: + self.name = "Track" + self.hide = hide + self.start = start + self.end = end + + # Attributes for the grey track background and labels + self.greytrack = greytrack + self.greytrack_labels = greytrack_labels + self.greytrack_fontsize = greytrack_fontsize + self.greytrack_font = greytrack_font + self.greytrack_font_rotation = greytrack_font_rotation + self.greytrack_fontcolor = greytrack_font_color + + # Attributes for the track scale + self.scale = scale + self.scale_format = scale_format + self.scale_color = scale_color + self.scale_font = scale_font + self.scale_fontsize = scale_fontsize + self.scale_fontangle = scale_fontangle + self.scale_ticks = scale_ticks + self.scale_largeticks = scale_largeticks + self.scale_smallticks = scale_smallticks + self.scale_largetick_interval = scale_largetick_interval + self.scale_smalltick_interval = scale_smalltick_interval + self.scale_largetick_labels = scale_largetick_labels + self.scale_smalltick_labels = scale_smalltick_labels + self.axis_labels = axis_labels + + def add_set(self, set): + """Add a preexisting FeatureSet or GraphSet object to the track.""" + set.id = self._next_id # Assign unique id to set + set.parent = self # Make set's parent this track + self._sets[self._next_id] = set # Add set, keyed by unique id + self._next_id += 1 # Increment unique set ids + + def new_set(self, type="feature", **args): + """Create a new FeatureSet or GraphSet object.
+ + Create a new FeatureSet or GraphSet object, add it to the + track, and return for user manipulation + """ + type_dict = {"feature": FeatureSet, "graph": GraphSet} + set = type_dict[type]() + for key in args: + setattr(set, key, args[key]) + set.id = self._next_id # Assign unique id to set + set.parent = self # Make set's parent this track + self._sets[self._next_id] = set # Add set, keyed by unique id + self._next_id += 1 # Increment unique set ids + return set + + def del_set(self, set_id): + """Remove the set with the passed id from the track.""" + del self._sets[set_id] + + def get_sets(self): + """Return the sets contained in this track.""" + return list(self._sets.values()) + + def get_ids(self): + """Return the ids of all sets contained in this track.""" + return list(self._sets.keys()) + + def range(self): + """Return the lowest and highest base (or mark) numbers as a tuple.""" + lows, highs = [], [] # Holds set of low and high values from sets + if self.start is not None: + lows.append(self.start) + if self.end is not None: + highs.append(self.end) + for set in self._sets.values(): + low, high = set.range() # Get each set range + lows.append(low) + highs.append(high) + if lows: + low = min(lows) + else: + low = None + if highs: + high = max(highs) + else: + high = None + return low, high # Return lowest and highest values + + def to_string(self, verbose=0): + """Return a formatted string with information about the track. + + Arguments: + - verbose - Boolean indicating whether a short or complete + account of the track is required + + """ + if not verbose: # Return the short description + return "%s" % self # Use __str__ method instead + else: # Return the long description + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d sets" % len(self._sets)) + for key in self._sets: + outstr.append("set: %s" % self._sets[key]) + return "\n".join(outstr) + + def __getitem__(self, key): + """Return the set with the passed id.""" + return self._sets[key] + + def __str__(self): + """Return a formatted string with information about the Track.""" + outstr = ["\n<%s: %s>" % (self.__class__, self.name)] + outstr.append("%d sets" % len(self._sets)) + return "\n".join(outstr) diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__init__.py b/code/lib/Bio/Graphics/GenomeDiagram/__init__.py new file mode 100644 index 0000000..ca40d28 --- /dev/null +++ b/code/lib/Bio/Graphics/GenomeDiagram/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2003-2008 by Leighton Pritchard. All rights reserved. +# Revisions copyright 2009 by Peter Cock. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
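With the Track container now fully defined, a short usage sketch may help; this assumes the vendored `Bio.Graphics.GenomeDiagram` package added in this diff is importable:

```python
# Sketch: building a Track and attaching feature sets.
from Bio.Graphics.GenomeDiagram import Track, FeatureSet

track = Track(name="CDS features", greytrack=1)
feature_set = track.new_set("feature")  # create and attach a new FeatureSet
track.add_set(FeatureSet())             # or attach a pre-built set
print(track.get_ids())                  # [0, 1] - ids assigned in order
print(track.to_string(verbose=1))       # long description listing both sets
```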
+# +# Contact: Leighton Pritchard, The James Hutton Institute, +# Invergowrie, Dundee, Scotland, DD2 5DA, UK +# Leighton.Pritchard@hutton.ac.uk +# ############################################################################# + +"""GenomeDiagram module integrated into Biopython.""" + +# Local imports, to make these classes available directly under the +# Bio.Graphics.GenomeDiagram namespace: + +from ._Diagram import Diagram +from ._Track import Track +from ._FeatureSet import FeatureSet +from ._GraphSet import GraphSet +from ._CrossLink import CrossLink +from ._Colors import ColorTranslator +from ._Feature import Feature +from ._Graph import GraphData + +__all__ = ( + "Diagram", + "Track", + "FeatureSet", + "Feature", + "GraphSet", + "GraphData", + "CrossLink", + "ColorTranslator", +) diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc new file mode 100644 index 0000000..b0e8b84 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc new file mode 100644 index 0000000..e580415 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc new file mode 100644 index 0000000..a8989e5 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc new file mode 100644 index 0000000..2b61193 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc new file mode 100644 index 0000000..9ae04f2 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc new file mode 100644 index 0000000..4dab43a Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc new file mode 100644 index 0000000..ac57f08 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc new file mode 100644 index 0000000..3aecb02 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc new file mode 100644 index 0000000..e9a7dc7 Binary files 
/dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc new file mode 100644 index 0000000..bd7db74 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc new file mode 100644 index 0000000..959ca99 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..6f04517 Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/KGML_vis.py b/code/lib/Bio/Graphics/KGML_vis.py new file mode 100644 index 0000000..9a09086 --- /dev/null +++ b/code/lib/Bio/Graphics/KGML_vis.py @@ -0,0 +1,443 @@ +# Copyright 2013 Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Classes and functions to visualise a KGML Pathway Map. + +The KGML definition is as of release KGML v0.7.1 +(http://www.kegg.jp/kegg/xml/docs/) + +Classes: +""" + + +import os +import tempfile +from io import BytesIO + +try: + from reportlab.lib import colors + from reportlab.pdfgen import canvas +except ImportError: + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Install reportlab if you want to use KGML_vis." + ) from None + +try: + from PIL import Image +except ImportError: + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Install pillow if you want to use KGML_vis." + ) from None + +from urllib.request import urlopen + +from Bio.KEGG.KGML.KGML_pathway import Pathway + + +def darken(color, factor=0.7): + """Return darkened color as a ReportLab RGB color. + + Take a passed color and returns a Reportlab color that is darker by the + factor indicated in the parameter. + """ + newcol = color_to_reportlab(color) + for a in ["red", "green", "blue"]: + setattr(newcol, a, factor * getattr(newcol, a)) + return newcol + + +def color_to_reportlab(color): + """Return the passed color in Reportlab Color format. + + We allow colors to be specified as hex values, tuples, or Reportlab Color + objects, and with or without an alpha channel. This function acts as a + Rosetta stone for conversion of those formats to a Reportlab Color + object, with alpha value. 
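The three input flavours this conversion accepts can be exercised directly; a small sketch (the hex and tuple values are arbitrary examples), assuming the vendored `Bio.Graphics.KGML_vis` module is importable:

```python
# Sketch: the inputs color_to_reportlab() converts to reportlab Colors.
from reportlab.lib import colors
from Bio.Graphics.KGML_vis import color_to_reportlab

print(color_to_reportlab(colors.red))            # Color object passes through
print(color_to_reportlab("#80CC33"))             # 7-char hex -> HexColor
print(color_to_reportlab("#80CC3380"))           # 9-char hex -> with alpha
print(color_to_reportlab((0.5, 0.8, 0.2, 0.5)))  # RGB(A) tuple -> Color
```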
+ + Any other color specification is returned directly + """ + # Reportlab Color objects are in the format we want already + if isinstance(color, colors.Color): + return color + elif isinstance(color, str): # String implies hex color + if color.startswith("0x"): # Standardise to octothorpe + color = color.replace("0x", "#") + if len(color) == 7: + return colors.HexColor(color) + else: + try: + return colors.HexColor(color, hasAlpha=True) + except TypeError: # Catch pre-2.7 Reportlab + raise RuntimeError( + "Your reportlab seems to be too old, try 2.7 onwards" + ) from None + elif isinstance(color, tuple): # Tuple implies RGB(alpha) tuple + return colors.Color(*color) + return color + + +def get_temp_imagefilename(url): + """Return filename of temporary file containing downloaded image. + + Create a new temporary file to hold the image file at the passed URL + and return the filename. + """ + img = urlopen(url).read() + im = Image.open(BytesIO(img)) + # im.transpose(Image.FLIP_TOP_BOTTOM) + f = tempfile.NamedTemporaryFile(delete=False, suffix=".png") + fname = f.name + f.close() + im.save(fname, "PNG") + return fname + + +class KGMLCanvas: + """Reportlab Canvas-based representation of a KGML pathway map.""" + + def __init__( + self, + pathway, + import_imagemap=False, + label_compounds=True, + label_orthologs=True, + label_reaction_entries=True, + label_maps=True, + show_maps=False, + fontname="Helvetica", + fontsize=6, + draw_relations=True, + show_orthologs=True, + show_compounds=True, + show_genes=True, + show_reaction_entries=True, + margins=(0.02, 0.02), + ): + """Initialize the class.""" + self.pathway = pathway + self.show_maps = show_maps + self.show_orthologs = show_orthologs + self.show_compounds = show_compounds + self.show_genes = show_genes + self.show_reaction_entries = show_reaction_entries + self.label_compounds = label_compounds + self.label_orthologs = label_orthologs + self.label_reaction_entries = label_reaction_entries + self.label_maps = label_maps + self.fontname = fontname + self.fontsize = fontsize + self.draw_relations = draw_relations + self.non_reactant_transparency = 0.3 + self.import_imagemap = import_imagemap # Import the map .png from URL + # percentage of canvas that will be margin on either side in the + # X and Y directions + self.margins = margins + + def draw(self, filename): + """Add the map elements to the drawing.""" + # Instantiate the drawing, first + # size x_max, y_max for now - we can add margins, later + if self.import_imagemap: + # We're drawing directly on the image, so we set the canvas to the + # same size as the image + if os.path.isfile(self.pathway.image): + imfilename = self.pathway.image + else: + imfilename = get_temp_imagefilename(self.pathway.image) + im = Image.open(imfilename) + cwidth, cheight = im.size + else: + # No image, so we set the canvas size to accommodate visible + # elements + cwidth, cheight = (self.pathway.bounds[1][0], self.pathway.bounds[1][1]) + # Instantiate canvas + self.drawing = canvas.Canvas( + filename, + bottomup=0, + pagesize=( + cwidth * (1 + 2 * self.margins[0]), + cheight * (1 + 2 * self.margins[1]), + ), + ) + self.drawing.setFont(self.fontname, self.fontsize) + # Transform the canvas to add the margins + self.drawing.translate( + self.margins[0] * self.pathway.bounds[1][0], + self.margins[1] * self.pathway.bounds[1][1], + ) + # Add the map image, if required + if self.import_imagemap: + self.drawing.saveState() + self.drawing.scale(1, -1) + self.drawing.translate(0, -cheight) +
self.drawing.drawImage(imfilename, 0, 0) + self.drawing.restoreState() + # Add the reactions, compounds and maps + # Maps go on first, to be overlaid by more information. + # By default, they're slightly transparent. + if self.show_maps: + self.__add_maps() + if self.show_reaction_entries: + self.__add_reaction_entries() + if self.show_orthologs: + self.__add_orthologs() + if self.show_compounds: + self.__add_compounds() + if self.show_genes: + self.__add_genes() + # TODO: complete draw_relations code + # if self.draw_relations: + # self.__add_relations() + # Write the pathway map to PDF + self.drawing.save() + + def __add_maps(self): + """Add maps to the drawing of the map (PRIVATE). + + We do this first, as they're regional labels to be overlaid by + information. Also, we want to set the color to something subtle. + + We're using Hex colors because that's what KGML uses, and + Reportlab doesn't mind. + """ + for m in self.pathway.maps: + for g in m.graphics: + self.drawing.setStrokeColor("#888888") + self.drawing.setFillColor("#DDDDDD") + self.__add_graphics(g) + if self.label_maps: + self.drawing.setFillColor("#888888") + self.__add_labels(g) + + def __add_graphics(self, graphics): + """Add the passed graphics object to the map (PRIVATE). + + Add text, add after the graphics object, for sane Z-ordering. + """ + if graphics.type == "line": + p = self.drawing.beginPath() + x, y = graphics.coords[0] + # There are optional settings for lines that aren't necessarily + # part of the KGML DTD + if graphics.width is not None: + self.drawing.setLineWidth(graphics.width) + else: + self.drawing.setLineWidth(1) + p.moveTo(x, y) + for (x, y) in graphics.coords: + p.lineTo(x, y) + self.drawing.drawPath(p) + self.drawing.setLineWidth(1) # Return to default + # KGML defines the (x, y) coordinates as the centre of the circle/ + # rectangle/roundrectangle, but Reportlab uses the co-ordinates of the + # lower-left corner for rectangles. + if graphics.type == "circle": + self.drawing.circle( + graphics.x, graphics.y, graphics.width * 0.5, stroke=1, fill=1 + ) + elif graphics.type == "roundrectangle": + self.drawing.roundRect( + graphics.x - graphics.width * 0.5, + graphics.y - graphics.height * 0.5, + graphics.width, + graphics.height, + min(graphics.width, graphics.height) * 0.1, + stroke=1, + fill=1, + ) + elif graphics.type == "rectangle": + self.drawing.rect( + graphics.x - graphics.width * 0.5, + graphics.y - graphics.height * 0.5, + graphics.width, + graphics.height, + stroke=1, + fill=1, + ) + + def __add_labels(self, graphics): + """Add labels for the passed graphics objects to the map (PRIVATE). + + We don't check that the labels fit inside objects such as circles/ + rectangles/roundrectangles. + """ + if graphics.type == "line": + # We use the midpoint of the line - sort of - we take the median + # line segment (list-wise, not in terms of length), and use the + # midpoint of that line. We could have other options here, + # maybe even parameterising it to a proportion of the total line + # length. + mid_idx = len(graphics.coords) * 0.5 + if not int(mid_idx) == mid_idx: + idx1, idx2 = int(mid_idx - 0.5), int(mid_idx + 0.5) + else: + idx1, idx2 = int(mid_idx - 1), int(mid_idx) + x1, y1 = graphics.coords[idx1] + x2, y2 = graphics.coords[idx2] + x, y = 0.5 * (x1 + x2), 0.5 * (y1 + y2) + elif graphics.type == "circle": + x, y = graphics.x, graphics.y + elif graphics.type in ("rectangle", "roundrectangle"): + x, y = graphics.x, graphics.y + # How big do we want the text, and how many characters?
+ if graphics._parent.type == "map": + text = graphics.name + self.drawing.setFont(self.fontname, self.fontsize + 2) + elif len(graphics.name) < 15: + text = graphics.name + else: + text = graphics.name[:12] + "..." + self.drawing.drawCentredString(x, y, text) + self.drawing.setFont(self.fontname, self.fontsize) + + def __add_orthologs(self): + """Add 'ortholog' Entry elements to the drawing of the map (PRIVATE). + + In KGML, these are typically line objects, so we render them + before the compound circles to cover the unsightly ends/junctions. + """ + for ortholog in self.pathway.orthologs: + for g in ortholog.graphics: + self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor)) + self.drawing.setFillColor(color_to_reportlab(g.bgcolor)) + self.__add_graphics(g) + if self.label_orthologs: + # We want the label color to be slightly darker + # (where possible), so it can be read + self.drawing.setFillColor(darken(g.fgcolor)) + self.__add_labels(g) + + def __add_reaction_entries(self): + """Add Entry elements for Reactions to the map drawing (PRIVATE). + + In KGML, these are typically line objects, so we render them + before the compound circles to cover the unsightly ends/junctions + """ + for reaction in self.pathway.reaction_entries: + for g in reaction.graphics: + self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor)) + self.drawing.setFillColor(color_to_reportlab(g.bgcolor)) + self.__add_graphics(g) + if self.label_reaction_entries: + # We want the label color to be slightly darker + # (where possible), so it can be read + self.drawing.setFillColor(darken(g.fgcolor)) + self.__add_labels(g) + + def __add_compounds(self): + """Add compound elements to the drawing of the map (PRIVATE).""" + for compound in self.pathway.compounds: + for g in compound.graphics: + # Modify transparency of compounds that don't participate + # in reactions + fillcolor = color_to_reportlab(g.bgcolor) + if not compound.is_reactant: + fillcolor.alpha *= self.non_reactant_transparency + self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor)) + self.drawing.setFillColor(fillcolor) + self.__add_graphics(g) + if self.label_compounds: + if not compound.is_reactant: + t = 0.3 + else: + t = 1 + self.drawing.setFillColor(colors.Color(0.2, 0.2, 0.2, t)) + self.__add_labels(g) + + def __add_genes(self): + """Add gene elements to the drawing of the map (PRIVATE).""" + for gene in self.pathway.genes: + for g in gene.graphics: + self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor)) + self.drawing.setFillColor(color_to_reportlab(g.bgcolor)) + self.__add_graphics(g) + if self.label_compounds: + self.drawing.setFillColor(darken(g.fgcolor)) + self.__add_labels(g) + + def __add_relations(self): + """Add relations to the map (PRIVATE). + + This is tricky. There is no defined graphic in KGML for a + relation, and the corresponding entries are typically defined + as objects 'to be connected somehow'. KEGG uses KegSketch, which + is not public, and most third-party software draws straight line + arrows, with heads to indicate the appropriate direction + (at both ends for reversible reactions), using solid lines for + ECrel relation types, and dashed lines for maplink relation types. + + The relation has: + - entry1: 'from' node + - entry2: 'to' node + - subtype: what the relation refers to + + Typically we have entry1 = map/ortholog; entry2 = map/ortholog, + subtype = compound. 
+ """ + # Dashed lines for maplinks, solid for everything else + for relation in list(self.pathway.relations): + if relation.type == "maplink": + self.drawing.setDash(6, 3) + else: + self.drawing.setDash() + for s in relation.subtypes: + subtype = self.pathway.entries[s[1]] + # Our aim is to draw an arrow from the entry1 object to the + # entry2 object, via the subtype object. + # 1) Entry 1 to subtype + self.__draw_arrow(relation.entry1, subtype) + # 2) subtype to Entry 2 + self.__draw_arrow(subtype, relation.entry2) + + def __draw_arrow(self, g_from, g_to): + """Draw an arrow between given Entry objects (PRIVATE). + + Draws an arrow from the g_from Entry object to the g_to + Entry object; both must have Graphics objects. + """ + # Centre and bound co-ordinates for the from and two objects + bounds_from, bounds_to = g_from.bounds, g_to.bounds + centre_from = ( + 0.5 * (bounds_from[0][0] + bounds_from[1][0]), + 0.5 * (bounds_from[0][1] + bounds_from[1][1]), + ) + centre_to = ( + 0.5 * (bounds_to[0][0] + bounds_to[1][0]), + 0.5 * (bounds_to[0][1] + bounds_to[1][1]), + ) + p = self.drawing.beginPath() + # print(True, g_from.name, g_to.name, bounds_to, bounds_from) + # If the 'from' and 'to' graphics are vertically-aligned, draw a line + # from the 'from' to the 'to' entity + if bounds_to[0][0] < centre_from[0] < bounds_to[1][0]: + # print(True, g_from.name, g_to.name, bounds_to, bounds_from) + if centre_to[1] > centre_from[1]: # to above from + p.moveTo(centre_from[0], bounds_from[1][1]) + p.lineTo(centre_from[0], bounds_to[0][1]) + # Draw arrow point - TODO + else: # to below from + p.moveTo(centre_from[0], bounds_from[0][1]) + p.lineTo(centre_from[0], bounds_to[1][1]) + # Draw arrow point - TODO + elif bounds_from[0][0] < centre_to[0] < bounds_from[1][0]: + # print(True, g_from.name, g_to.name, bounds_to, bounds_from) + if centre_to[1] > centre_from[1]: # to above from + p.moveTo(centre_to[0], bounds_from[1][1]) + p.lineTo(centre_to[0], bounds_to[0][1]) + # Draw arrow point - TODO + else: # to below from + p.moveTo(centre_to[0], bounds_from[0][1]) + p.lineTo(centre_to[0], bounds_to[1][1]) + # Draw arrow point - TODO + self.drawing.drawPath(p) # Draw arrow shaft + # print(g_from) + # print(bounds_from) + # print(g_to) + # print(bounds_to) diff --git a/code/lib/Bio/Graphics/__init__.py b/code/lib/Bio/Graphics/__init__.py new file mode 100644 index 0000000..8720bb4 --- /dev/null +++ b/code/lib/Bio/Graphics/__init__.py @@ -0,0 +1,90 @@ +# Copyright 2008 by Brad Chapman. All rights reserved. +# Copyright 2008 by Michiel de Hoon. All rights reserved. +# Copyright 2009-2017 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.Graphics offers several graphical outputs, all using ReportLab.""" + +# Check if ReportLab is installed. +try: + import reportlab as r + + del r +except ImportError: + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError( + "Please install ReportLab if you want " + "to use Bio.Graphics. You can find ReportLab at " + "http://www.reportlab.com/software/opensource/" + ) from None + + +# The following code is to allow all the Bio.Graphics +# code to deal with the different ReportLab renderers +# and the API quirks consistently. 
+ + +def _write(drawing, output_file, format, dpi=72): + """Standardize output to files (PRIVATE). + + Writes the provided drawing out to a file in a prescribed format. + + - drawing - suitable ReportLab drawing object. + - output_file - a handle to write to, or a filename to write to. + - format - String indicating output format, one of PS, PDF, SVG, + or provided the ReportLab renderPM module is installed, + one of the bitmap formats JPG, BMP, GIF, PNG, TIFF or TIF. + The format can be given in any case. + - dpi - Resolution (dots per inch) for bitmap formats. + + No return value. + """ + from reportlab.graphics import renderPS, renderPDF, renderSVG + + try: + from reportlab.graphics import renderPM + except ImportError: + # This is an optional part of ReportLab, so may not be installed. + # We'll raise a missing dependency error if rendering to a + # bitmap format is attempted. + renderPM = None + + formatdict = { + "PS": renderPS, + "EPS": renderPS, + # not sure which you actually get, PS or EPS, but + # GenomeDiagram used PS while other modules used EPS. + "PDF": renderPDF, + "SVG": renderSVG, + "JPG": renderPM, + "BMP": renderPM, + "GIF": renderPM, + "PNG": renderPM, + "TIFF": renderPM, + "TIF": renderPM, + } + try: + # If output is not a string, then .upper() will trigger + # an attribute error... + drawmethod = formatdict[format.upper()] # select drawing method + except (KeyError, AttributeError): + raise ValueError( + "Output format should be one of %s" % ", ".join(formatdict) + ) from None + + if drawmethod is None: + # i.e. We wanted renderPM but it isn't installed + # See the import at the top of the function. + from Bio import MissingPythonDependencyError + + raise MissingPythonDependencyError("Please install ReportLab's renderPM module") + + if drawmethod == renderPM: + # This has a different API to the other render objects + return drawmethod.drawToFile(drawing, output_file, format, dpi=dpi) + else: + return drawmethod.drawToFile(drawing, output_file) diff --git a/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc new file mode 100644 index 0000000..366a8eb Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc new file mode 100644 index 0000000..fdef3ca Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc new file mode 100644 index 0000000..7a2e4da Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc new file mode 100644 index 0000000..363b6b5 Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc new file mode 100644 index 0000000..aa31c44 Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc
b/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc new file mode 100644 index 0000000..f086a54 Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc differ diff --git a/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..22ca765 Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/HMM/DynamicProgramming.py b/code/lib/Bio/HMM/DynamicProgramming.py new file mode 100644 index 0000000..9f9b095 --- /dev/null +++ b/code/lib/Bio/HMM/DynamicProgramming.py @@ -0,0 +1,326 @@ +# Copyright 2001 Brad Chapman. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Dynamic Programming algorithms for general usage. + +This module contains classes which implement Dynamic Programming +algorithms that can be used generally. +""" + + +class AbstractDPAlgorithms: + """An abstract class to calculate forward and backward probabilities. + + This class should not be instantiated directly, but should be used + through a derived class which implements proper scaling of variables. + + This class is just meant to encapsulate the basic forward and backward + algorithms, and allow derived classes to deal with the problems of + multiplying probabilities. + + Derived class of this must implement: + + - _forward_recursion -- Calculate the forward values in the recursion + using some kind of technique for preventing underflow errors. + - _backward_recursion -- Calculate the backward values in the recursion + step using some technique to prevent underflow errors. + + """ + + def __init__(self, markov_model, sequence): + """Initialize to calculate forward and backward probabilities. + + Arguments: + - markov_model -- The current Markov model we are working with. + - sequence -- A training sequence containing a set of emissions. + + """ + self._mm = markov_model + self._seq = sequence + + def _forward_recursion(self, cur_state, sequence_pos, forward_vars): + """Calculate the forward recursion value (PRIVATE).""" + raise NotImplementedError("Subclasses must implement") + + def forward_algorithm(self): + """Calculate sequence probability using the forward algorithm. + + This implements the forward algorithm, as described on p57-58 of + Durbin et al. + + Returns: + - A dictionary containing the forward variables. This has keys of the + form (state letter, position in the training sequence), and values + containing the calculated forward variable. + - The calculated probability of the sequence. + + """ + # all of the different letters that the state path can be in + state_letters = self._mm.state_alphabet + + # -- initialize the algorithm + # + # NOTE: My index numbers are one less than what is given in Durbin + # et al, since we are indexing the sequence going from 0 to + # (Length - 1) not 1 to Length, like in Durbin et al. + # + forward_var = {} + # f_{0}(0) = 1 + forward_var[(state_letters[0], -1)] = 1 + # f_{k}(0) = 0, for k > 0 + for k in range(1, len(state_letters)): + forward_var[(state_letters[k], -1)] = 0 + + # -- now do the recursion step + # loop over the training sequence + # Recursion step: (i = 1 .. 
L) + for i in range(len(self._seq.emissions)): + # now loop over the letters in the state path + for main_state in state_letters: + # calculate the forward value using the appropriate + # method to prevent underflow errors + forward_value = self._forward_recursion(main_state, i, forward_var) + + if forward_value is not None: + forward_var[(main_state, i)] = forward_value + + # -- termination step - calculate the probability of the sequence + first_state = state_letters[0] + seq_prob = 0 + + for state_item in state_letters: + # f_{k}(L) + forward_value = forward_var[(state_item, len(self._seq.emissions) - 1)] + # a_{k0} + transition_value = self._mm.transition_prob[(state_item, first_state)] + + seq_prob += forward_value * transition_value + + return forward_var, seq_prob + + def _backward_recursion(self, cur_state, sequence_pos, forward_vars): + """Calculate the backward recursion value (PRIVATE).""" + raise NotImplementedError("Subclasses must implement") + + def backward_algorithm(self): + """Calculate sequence probability using the backward algorithm. + + This implements the backward algorithm, as described on p58-59 of + Durbin et al. + + Returns: + - A dictionary containing the backwards variables. This has keys + of the form (state letter, position in the training sequence), + and values containing the calculated backward variable. + + """ + # all of the different letters that the state path can be in + state_letters = self._mm.state_alphabet + + # -- initialize the algorithm + # + # NOTE: My index numbers are one less than what is given in Durbin + # et al, since we are indexing the sequence going from 0 to + # (Length - 1) not 1 to Length, like in Durbin et al. + # + backward_var = {} + + first_letter = state_letters[0] + # b_{k}(L) = a_{k0} for all k + for state in state_letters: + backward_var[ + (state, len(self._seq.emissions) - 1) + ] = self._mm.transition_prob[(state, state_letters[0])] + + # -- recursion + # first loop over the training sequence backwards + # Recursion step: (i = L - 1 ... 1) + all_indexes = list(range(len(self._seq.emissions) - 1)) + all_indexes.reverse() + for i in all_indexes: + # now loop over the letters in the state path + for main_state in state_letters: + # calculate the backward value using the appropriate + # method to prevent underflow errors + backward_value = self._backward_recursion(main_state, i, backward_var) + + if backward_value is not None: + backward_var[(main_state, i)] = backward_value + + # skip the termination step to avoid recalculations -- you should + # get sequence probabilities using the forward algorithm + + return backward_var + + +class ScaledDPAlgorithms(AbstractDPAlgorithms): + """Implement forward and backward algorithms using a rescaling approach. + + This scales the f and b variables, so that they remain within a + manageable numerical interval during calculations. This approach is + described in Durbin et al. on p 78. + + This approach is a little more straightforward than log transformation + but may still give underflow errors for some types of models. In these + cases, the LogDPAlgorithms class should be used. + """ + + def __init__(self, markov_model, sequence): + """Initialize the scaled approach to calculating probabilities. + + Arguments: + - markov_model -- The current Markov model we are working with. + - sequence -- A TrainingSequence object that must have a + set of emissions to work with.
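To make the forward recursion concrete, here is a self-contained toy version over plain dicts (a two-state model with arbitrary numbers, independent of the class machinery above; the termination step here simply sums the final forward values rather than applying end-state transitions):

```python
# Sketch: forward recursion f_l(i) = e_l(x_i) * sum_k f_k(i-1) * a_kl
states = ("F", "L")  # e.g. fair / loaded die states
init = {"F": 0.5, "L": 0.5}
trans = {("F", "F"): 0.9, ("F", "L"): 0.1,
         ("L", "F"): 0.2, ("L", "L"): 0.8}
emit = {("F", "6"): 1 / 6, ("L", "6"): 0.5,
        ("F", "1"): 1 / 6, ("L", "1"): 0.1}

seq = ["6", "6", "1"]
f = {s: init[s] * emit[(s, seq[0])] for s in states}  # initialization
for x in seq[1:]:                                     # recursion
    f = {s: emit[(s, x)] * sum(f[k] * trans[(k, s)] for k in states)
         for s in states}
print(sum(f.values()))  # P(sequence), small but non-zero
```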
+ + """ + AbstractDPAlgorithms.__init__(self, markov_model, sequence) + + self._s_values = {} + + def _calculate_s_value(self, seq_pos, previous_vars): + """Calculate the next scaling variable for a sequence position (PRIVATE). + + This utilizes the approach of choosing s values such that the + sum of all of the scaled f values is equal to 1. + + Arguments: + - seq_pos -- The current position we are at in the sequence. + - previous_vars -- All of the forward or backward variables + calculated so far. + + Returns: + - The calculated scaling variable for the sequence item. + + """ + # all of the different letters the state can have + state_letters = self._mm.state_alphabet + + # loop over all of the possible states + s_value = 0 + for main_state in state_letters: + emission = self._mm.emission_prob[ + (main_state, self._seq.emissions[seq_pos]) + ] + + # now sum over all of the previous vars and transitions + trans_and_var_sum = 0 + for second_state in self._mm.transitions_from(main_state): + # the value of the previous f or b value + var_value = previous_vars[(second_state, seq_pos - 1)] + + # the transition probability + trans_value = self._mm.transition_prob[(second_state, main_state)] + + trans_and_var_sum += var_value * trans_value + + s_value += emission * trans_and_var_sum + + return s_value + + def _forward_recursion(self, cur_state, sequence_pos, forward_vars): + """Calculate the value of the forward recursion (PRIVATE). + + Arguments: + - cur_state -- The letter of the state we are calculating the + forward variable for. + - sequence_pos -- The position we are at in the training seq. + - forward_vars -- The current set of forward variables + + """ + # calculate the s value, if we haven't done so already (ie. during + # a previous forward or backward recursion) + if sequence_pos not in self._s_values: + self._s_values[sequence_pos] = self._calculate_s_value( + sequence_pos, forward_vars + ) + + # e_{l}(x_{i}) + seq_letter = self._seq.emissions[sequence_pos] + cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)] + # divide by the scaling value + scale_emission_prob = float(cur_emission_prob) / float( + self._s_values[sequence_pos] + ) + + # loop over all of the possible states at the position + state_pos_sum = 0 + have_transition = 0 + for second_state in self._mm.transitions_from(cur_state): + have_transition = 1 + + # get the previous forward_var values + # f_{k}(i - 1) + prev_forward = forward_vars[(second_state, sequence_pos - 1)] + + # a_{kl} + cur_trans_prob = self._mm.transition_prob[(second_state, cur_state)] + state_pos_sum += prev_forward * cur_trans_prob + + # if we have the possibility of having a transition + # return the recursion value + if have_transition: + return scale_emission_prob * state_pos_sum + else: + return None + + def _backward_recursion(self, cur_state, sequence_pos, backward_vars): + """Calculate the value of the backward recursion (PRIVATE). + + Arguments: + - cur_state -- The letter of the state we are calculating the + forward variable for. + - sequence_pos -- The position we are at in the training seq. + - backward_vars -- The current set of backward variables + + """ + # calculate the s value, if we haven't done so already (ie. 
during + # a previous forward or backward recursion) + if sequence_pos not in self._s_values: + self._s_values[sequence_pos] = self._calculate_s_value( + sequence_pos, backward_vars + ) + + # loop over all of the possible states at the position + state_pos_sum = 0 + have_transition = 0 + for second_state in self._mm.transitions_from(cur_state): + have_transition = 1 + # e_{l}(x_{i + 1}) + seq_letter = self._seq.emissions[sequence_pos + 1] + cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)] + + # get the previous backward_var value + # b_{l}(i + 1) + prev_backward = backward_vars[(second_state, sequence_pos + 1)] + + # the transition probability -- a_{kl} + cur_transition_prob = self._mm.transition_prob[(cur_state, second_state)] + + state_pos_sum += cur_emission_prob * prev_backward * cur_transition_prob + + # if we have a probability for a transition, return it + if have_transition: + return state_pos_sum / float(self._s_values[sequence_pos]) + # otherwise we have no probability (ie. we can't do this transition) + # and return None + else: + return None + + +class LogDPAlgorithms(AbstractDPAlgorithms): + """Implement forward and backward algorithms using a log approach. + + This uses the approach of calculating the sum of log probabilities + using a lookup table for common values. + + XXX This is not implemented yet! + """ + + def __init__(self, markov_model, sequence): + """Initialize the class.""" + raise NotImplementedError("Haven't coded this yet...") diff --git a/code/lib/Bio/HMM/MarkovModel.py b/code/lib/Bio/HMM/MarkovModel.py new file mode 100644 index 0000000..ef9fef6 --- /dev/null +++ b/code/lib/Bio/HMM/MarkovModel.py @@ -0,0 +1,677 @@ +# Copyright 2001 Brad Chapman. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Deal with representations of Markov Models.""" +# standard modules +import copy +import math +import random +from collections import defaultdict + +from Bio.Seq import Seq + + +def _gen_random_array(n): + """Return an array of n random numbers summing to 1.0 (PRIVATE).""" + randArray = [random.random() for _ in range(n)] + total = sum(randArray) + return [x / total for x in randArray] + + +def _calculate_emissions(emission_probs): + """Calculate which symbols can be emitted in each state (PRIVATE).""" + # loop over all of the state-symbol duples, mapping states to + # lists of emitted symbols + emissions = defaultdict(list) + for state, symbol in emission_probs: + emissions[state].append(symbol) + + return emissions + + +def _calculate_from_transitions(trans_probs): + """Calculate which 'from transitions' are allowed for each state (PRIVATE). + + This looks through all of the trans_probs, and uses this dictionary + to determine allowed transitions. It converts this information into + a dictionary, whose keys are source states and whose values are + lists of destination states reachable from the source state via a + transition. + """ + transitions = defaultdict(list) + for from_state, to_state in trans_probs: + transitions[from_state].append(to_state) + + return transitions + + +def _calculate_to_transitions(trans_probs): + """Calculate which 'to transitions' are allowed for each state (PRIVATE). + + This looks through all of the trans_probs, and uses this dictionary + to determine allowed transitions. 
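The grouping performed by `_calculate_from_transitions` and `_calculate_to_transitions` is a single pass over the `(from, to)` keys; a sketch with a toy transition table:

```python
# Sketch: deriving allowed from-/to-transition maps from trans_probs keys.
from collections import defaultdict

trans_probs = {("A", "B"): 0.3, ("A", "A"): 0.7, ("B", "A"): 1.0}

from_transitions = defaultdict(list)  # source -> reachable destinations
to_transitions = defaultdict(list)    # destination -> possible sources
for from_state, to_state in trans_probs:
    from_transitions[from_state].append(to_state)
    to_transitions[to_state].append(from_state)

print(dict(from_transitions))  # {'A': ['B', 'A'], 'B': ['A']}
print(dict(to_transitions))    # {'B': ['A'], 'A': ['A', 'B']}
```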
It converts this information into + a dictionary, whose keys are destination states and whose values are + lists of source states from which the destination is reachable via a + transition. + """ + transitions = defaultdict(list) + for from_state, to_state in trans_probs: + transitions[to_state].append(from_state) + + return transitions + + +class MarkovModelBuilder: + """Interface to build up a Markov Model. + + This class is designed to try to separate the task of specifying the + Markov Model from the actual model itself. This is in hopes of making + the actual Markov Model classes smaller. + + So, this builder class should be used to create Markov models instead + of trying to initiate a Markov Model directly. + """ + + # the default pseudo counts to use + DEFAULT_PSEUDO = 1 + + def __init__(self, state_alphabet, emission_alphabet): + """Initialize a builder to create Markov Models. + + Arguments: + - state_alphabet -- An iterable (e.g., tuple or list) containing + all of the letters that can appear in the states + - emission_alphabet -- An iterable (e.g., tuple or list) containing + all of the letters for states that can be emitted by the HMM. + + """ + self._state_alphabet = tuple(state_alphabet) + self._emission_alphabet = tuple(emission_alphabet) + + # probabilities for the initial state, initialized by calling + # set_initial_probabilities (required) + self.initial_prob = {} + + # the probabilities for transitions and emissions + # by default we have no transitions and all possible emissions + self.transition_prob = {} + self.emission_prob = self._all_blank(state_alphabet, emission_alphabet) + + # the default pseudocounts for transition and emission counting + self.transition_pseudo = {} + self.emission_pseudo = self._all_pseudo(state_alphabet, emission_alphabet) + + def _all_blank(self, first_alphabet, second_alphabet): + """Return a dictionary with all counts set to zero (PRIVATE). + + This uses the letters in the first and second alphabet to create + a dictionary with keys of two tuples organized as + (letter of first alphabet, letter of second alphabet). The values + are all set to 0. + """ + all_blank = {} + for first_state in first_alphabet: + for second_state in second_alphabet: + all_blank[(first_state, second_state)] = 0 + + return all_blank + + def _all_pseudo(self, first_alphabet, second_alphabet): + """Return a dictionary with all counts set to a default value (PRIVATE). + + This takes the letters in first alphabet and second alphabet and + creates a dictionary with keys of two tuples organized as: + (letter of first alphabet, letter of second alphabet). The values + are all set to the value of the class attribute DEFAULT_PSEUDO. + """ + all_counts = {} + for first_state in first_alphabet: + for second_state in second_alphabet: + all_counts[(first_state, second_state)] = self.DEFAULT_PSEUDO + + return all_counts + + def get_markov_model(self): + """Return the markov model corresponding with the current parameters. + + Each markov model returned by a call to this function is unique + (ie. they don't influence each other). 
+ """ + # user must set initial probabilities + if not self.initial_prob: + raise Exception( + "set_initial_probabilities must be called to " + "fully initialize the Markov model" + ) + + initial_prob = copy.deepcopy(self.initial_prob) + transition_prob = copy.deepcopy(self.transition_prob) + emission_prob = copy.deepcopy(self.emission_prob) + transition_pseudo = copy.deepcopy(self.transition_pseudo) + emission_pseudo = copy.deepcopy(self.emission_pseudo) + + return HiddenMarkovModel( + self._state_alphabet, + self._emission_alphabet, + initial_prob, + transition_prob, + emission_prob, + transition_pseudo, + emission_pseudo, + ) + + def set_initial_probabilities(self, initial_prob): + """Set initial state probabilities. + + initial_prob is a dictionary mapping states to probabilities. + Suppose, for example, that the state alphabet is ('A', 'B'). Call + set_initial_prob({'A': 1}) to guarantee that the initial + state will be 'A'. Call set_initial_prob({'A': 0.5, 'B': 0.5}) + to make each initial state equally probable. + + This method must now be called in order to use the Markov model + because the calculation of initial probabilities has changed + incompatibly; the previous calculation was incorrect. + + If initial probabilities are set for all states, then they should add up + to 1. Otherwise the sum should be <= 1. The residual probability is + divided up evenly between all the states for which the initial + probability has not been set. For example, calling + set_initial_prob({}) results in P('A') = 0.5 and P('B') = 0.5, + for the above example. + """ + self.initial_prob = copy.copy(initial_prob) + + # ensure that all referenced states are valid + for state in initial_prob: + if state not in self._state_alphabet: + raise ValueError( + "State %s was not found in the sequence alphabet" % state + ) + + # distribute the residual probability, if any + num_states_not_set = len(self._state_alphabet) - len(self.initial_prob) + if num_states_not_set < 0: + raise Exception("Initial probabilities can't exceed # of states") + prob_sum = sum(self.initial_prob.values()) + if prob_sum > 1.0: + raise Exception("Total initial probability cannot exceed 1.0") + if num_states_not_set > 0: + prob = (1.0 - prob_sum) / num_states_not_set + for state in self._state_alphabet: + if state not in self.initial_prob: + self.initial_prob[state] = prob + + def set_equal_probabilities(self): + """Reset all probabilities to be an average value. + + Resets the values of all initial probabilities and all allowed + transitions and all allowed emissions to be equal to 1 divided by the + number of possible elements. + + This is useful if you just want to initialize a Markov Model to + starting values (ie. if you have no prior notions of what the + probabilities should be -- or if you are just feeling too lazy + to calculate them :-). + + Warning 1 -- this will reset all currently set probabilities. + + Warning 2 -- This just sets all probabilities for transitions and + emissions to total up to 1, so it doesn't ensure that the sum of + each set of transitions adds up to 1. 
+        """
+        # set initial state probabilities
+        # (1 / the number of states, so the initial distribution sums to 1)
+        new_initial_prob = float(1) / float(len(self._state_alphabet))
+        for state in self._state_alphabet:
+            self.initial_prob[state] = new_initial_prob
+
+        # set the transitions
+        new_trans_prob = float(1) / float(len(self.transition_prob))
+        for key in self.transition_prob:
+            self.transition_prob[key] = new_trans_prob
+
+        # set the emissions
+        new_emission_prob = float(1) / float(len(self.emission_prob))
+        for key in self.emission_prob:
+            self.emission_prob[key] = new_emission_prob
+
+    def set_random_initial_probabilities(self):
+        """Set all initial state probabilities to a randomly generated distribution.
+
+        Returns the dictionary containing the initial probabilities.
+        """
+        initial_freqs = _gen_random_array(len(self._state_alphabet))
+        for state in self._state_alphabet:
+            self.initial_prob[state] = initial_freqs.pop()
+
+        return self.initial_prob
+
+    def set_random_transition_probabilities(self):
+        """Set all allowed transition probabilities to a randomly generated distribution.
+
+        Returns the dictionary containing the transition probabilities.
+        """
+        if not self.transition_prob:
+            raise Exception(
+                "No transitions have been allowed yet. "
+                "Allow some or all transitions by calling "
+                "allow_transition or allow_all_transitions first."
+            )
+
+        transitions_from = _calculate_from_transitions(self.transition_prob)
+        for from_state in transitions_from:
+            freqs = _gen_random_array(len(transitions_from[from_state]))
+            for to_state in transitions_from[from_state]:
+                self.transition_prob[(from_state, to_state)] = freqs.pop()
+
+        return self.transition_prob
+
+    def set_random_emission_probabilities(self):
+        """Set all allowed emission probabilities to a randomly generated distribution.
+
+        Returns the dictionary containing the emission probabilities.
+        """
+        if not self.emission_prob:
+            raise Exception(
+                "No emissions have been allowed yet. Allow some or all emissions."
+            )
+
+        emissions = _calculate_emissions(self.emission_prob)
+        for state in emissions:
+            freqs = _gen_random_array(len(emissions[state]))
+            for symbol in emissions[state]:
+                self.emission_prob[(state, symbol)] = freqs.pop()
+
+        return self.emission_prob
+
+    def set_random_probabilities(self):
+        """Set all probabilities to randomly generated numbers.
+
+        Resets probabilities of all initial states, transitions, and
+        emissions to random values.
+        """
+        self.set_random_initial_probabilities()
+        self.set_random_transition_probabilities()
+        self.set_random_emission_probabilities()
+
+    # --- functions to deal with the transitions in the sequence
+
+    def allow_all_transitions(self):
+        """Create transitions between all states.
+
+        By default all transitions within the alphabet are disallowed;
+        this is a convenience function to change this to allow all
+        possible transitions.
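Warning 2 on `set_equal_probabilities` above is easy to trip over, so here is a small demonstration of what it means, using a hypothetical two-state chain:

```python
builder = MarkovModelBuilder(("A", "B"), ("x",))
builder.set_initial_probabilities({})
builder.allow_all_transitions()    # 2 states -> 4 allowed transitions
builder.set_equal_probabilities()
# Each transition gets 1/4, so the outgoing row for "A" sums to 0.5, not 1:
assert builder.transition_prob[("A", "B")] == 0.25
```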
+ """ + # first get all probabilities and pseudo counts set + # to the default values + all_probs = self._all_blank(self._state_alphabet, self._state_alphabet) + + all_pseudo = self._all_pseudo(self._state_alphabet, self._state_alphabet) + + # now set any probabilities and pseudo counts that + # were previously set + for set_key in self.transition_prob: + all_probs[set_key] = self.transition_prob[set_key] + + for set_key in self.transition_pseudo: + all_pseudo[set_key] = self.transition_pseudo[set_key] + + # finally reinitialize the transition probs and pseudo counts + self.transition_prob = all_probs + self.transition_pseudo = all_pseudo + + def allow_transition( + self, from_state, to_state, probability=None, pseudocount=None + ): + """Set a transition as being possible between the two states. + + probability and pseudocount are optional arguments + specifying the probabilities and pseudo counts for the transition. + If these are not supplied, then the values are set to the + default values. + + Raises: + KeyError -- if the two states already have an allowed transition. + + """ + # check the sanity of adding these states + for state in [from_state, to_state]: + if state not in self._state_alphabet: + raise ValueError( + "State %s was not found in the sequence alphabet" % state + ) + + # ensure that the states are not already set + if (from_state, to_state) not in self.transition_prob and ( + from_state, + to_state, + ) not in self.transition_pseudo: + # set the initial probability + if probability is None: + probability = 0 + self.transition_prob[(from_state, to_state)] = probability + + # set the initial pseudocounts + if pseudocount is None: + pseudocount = self.DEFAULT_PSEUDO + self.transition_pseudo[(from_state, to_state)] = pseudocount + else: + raise KeyError( + "Transition from %s to %s is already allowed." % (from_state, to_state) + ) + + def destroy_transition(self, from_state, to_state): + """Restrict transitions between the two states. + + Raises: + KeyError if the transition is not currently allowed. + + """ + try: + del self.transition_prob[(from_state, to_state)] + del self.transition_pseudo[(from_state, to_state)] + except KeyError: + raise KeyError( + "Transition from %s to %s is already disallowed." + % (from_state, to_state) + ) + + def set_transition_score(self, from_state, to_state, probability): + """Set the probability of a transition between two states. + + Raises: + KeyError if the transition is not allowed. + + """ + if (from_state, to_state) in self.transition_prob: + self.transition_prob[(from_state, to_state)] = probability + else: + raise KeyError( + "Transition from %s to %s is not allowed." % (from_state, to_state) + ) + + def set_transition_pseudocount(self, from_state, to_state, count): + """Set the default pseudocount for a transition. + + To avoid computational problems, it is helpful to be able to + set a 'default' pseudocount to start with for estimating + transition and emission probabilities (see p62 in Durbin et al + for more discussion on this. By default, all transitions have + a pseudocount of 1. + + Raises: + KeyError if the transition is not allowed. + + """ + if (from_state, to_state) in self.transition_pseudo: + self.transition_pseudo[(from_state, to_state)] = count + else: + raise KeyError( + "Transition from %s to %s is not allowed." 
% (from_state, to_state) + ) + + # --- functions to deal with emissions from the sequence + + def set_emission_score(self, seq_state, emission_state, probability): + """Set the probability of a emission from a particular state. + + Raises: + KeyError if the emission from the given state is not allowed. + + """ + if (seq_state, emission_state) in self.emission_prob: + self.emission_prob[(seq_state, emission_state)] = probability + else: + raise KeyError( + "Emission of %s from %s is not allowed." % (emission_state, seq_state) + ) + + def set_emission_pseudocount(self, seq_state, emission_state, count): + """Set the default pseudocount for an emission. + + To avoid computational problems, it is helpful to be able to + set a 'default' pseudocount to start with for estimating + transition and emission probabilities (see p62 in Durbin et al + for more discussion on this. By default, all emissions have + a pseudocount of 1. + + Raises: + KeyError if the emission from the given state is not allowed. + + """ + if (seq_state, emission_state) in self.emission_pseudo: + self.emission_pseudo[(seq_state, emission_state)] = count + else: + raise KeyError( + "Emission of %s from %s is not allowed." % (emission_state, seq_state) + ) + + +class HiddenMarkovModel: + """Represent a hidden markov model that can be used for state estimation.""" + + def __init__( + self, + state_alphabet, + emission_alphabet, + initial_prob, + transition_prob, + emission_prob, + transition_pseudo, + emission_pseudo, + ): + """Initialize a Markov Model. + + Note: You should use the MarkovModelBuilder class instead of + initiating this class directly. + + Arguments: + - state_alphabet -- A tuple containing all of the letters that can + appear in the states. + - emission_alphabet -- A tuple containing all of the letters for + states that can be emitted by the HMM. + - initial_prob - A dictionary of initial probabilities for all states. + - transition_prob -- A dictionary of transition probabilities for all + possible transitions in the sequence. + - emission_prob -- A dictionary of emission probabilities for all + possible emissions from the sequence states. + - transition_pseudo -- Pseudo-counts to be used for the transitions, + when counting for purposes of estimating transition probabilities. + - emission_pseudo -- Pseudo-counts to be used for the emissions, + when counting for purposes of estimating emission probabilities. + + """ + self.state_alphabet = state_alphabet + self.emission_alphabet = emission_alphabet + + self.initial_prob = initial_prob + + self._transition_pseudo = transition_pseudo + self._emission_pseudo = emission_pseudo + + self.transition_prob = transition_prob + self.emission_prob = emission_prob + + # a dictionary of the possible transitions from each state + # each key is a source state, mapped to a list of the destination states + # that are reachable from the source state via a transition + self._transitions_from = _calculate_from_transitions(self.transition_prob) + + # a dictionary of the possible transitions to each state + # each key is a destination state, mapped to a list of source states + # from which the destination is reachable via a transition + self._transitions_to = _calculate_to_transitions(self.transition_prob) + + def get_blank_transitions(self): + """Get the default transitions for the model. + + Returns a dictionary of all of the default transitions between any + two letters in the sequence alphabet. 
The dictionary is structured
+        with keys as (letter1, letter2) and values as the starting number
+        of transitions.
+        """
+        return self._transition_pseudo
+
+    def get_blank_emissions(self):
+        """Get the starting default emissions for each sequence.
+
+        This returns a dictionary of the default emissions for each
+        letter. The dictionary is structured with keys as
+        (seq_letter, emission_letter) and values as the starting number
+        of emissions.
+        """
+        return self._emission_pseudo
+
+    def transitions_from(self, state_letter):
+        """Get all destination states which can transition from source state_letter.
+
+        This returns all letters which the given state_letter can transition
+        to, i.e. all the destination states reachable from state_letter.
+
+        An empty list is returned if state_letter has no outgoing transitions.
+        """
+        if state_letter in self._transitions_from:
+            return self._transitions_from[state_letter]
+        else:
+            return []
+
+    def transitions_to(self, state_letter):
+        """Get all source states which can transition to destination state_letter.
+
+        This returns all letters which the given state_letter is reachable
+        from, i.e. all the source states which can reach state_letter.
+
+        An empty list is returned if state_letter is unreachable.
+        """
+        if state_letter in self._transitions_to:
+            return self._transitions_to[state_letter]
+        else:
+            return []
+
+    def viterbi(self, sequence, state_alphabet):
+        """Calculate the most probable state path using the Viterbi algorithm.
+
+        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
+        al for a full explanation -- this is where I took my implementation
+        ideas from), to allow decoding of the state path, given a sequence
+        of emissions.
+
+        Arguments:
+         - sequence -- A Seq object with the emission sequence that we
+           want to decode.
+         - state_alphabet -- An iterable (e.g., tuple or list) containing
+           all of the letters that can appear in the states
+
+        """
+        # calculate logarithms of the initial, transition, and emission probs
+        log_initial = self._log_transform(self.initial_prob)
+        log_trans = self._log_transform(self.transition_prob)
+        log_emission = self._log_transform(self.emission_prob)
+
+        viterbi_probs = {}
+        pred_state_seq = {}
+
+        # --- recursion
+        # loop over the training sequence (i = 1 .. L)
+        # NOTE: My index numbers are one less than what is given in Durbin
+        # et al, since we are indexing the sequence going from 0 to
+        # (Length - 1) not 1 to Length, like in Durbin et al.
+ for i in range(0, len(sequence)): + # loop over all of the possible i-th states in the state path + for cur_state in state_alphabet: + # e_{l}(x_{i}) + emission_part = log_emission[(cur_state, sequence[i])] + + max_prob = 0 + if i == 0: + # for the first state, use the initial probability rather + # than looking back to previous states + max_prob = log_initial[cur_state] + else: + # loop over all possible (i-1)-th previous states + possible_state_probs = {} + for prev_state in self.transitions_to(cur_state): + # a_{kl} + trans_part = log_trans[(prev_state, cur_state)] + + # v_{k}(i - 1) + viterbi_part = viterbi_probs[(prev_state, i - 1)] + cur_prob = viterbi_part + trans_part + + possible_state_probs[prev_state] = cur_prob + + # calculate the viterbi probability using the max + max_prob = max(possible_state_probs.values()) + + # v_{k}(i) + viterbi_probs[(cur_state, i)] = emission_part + max_prob + + if i > 0: + # get the most likely prev_state leading to cur_state + for state in possible_state_probs: + if possible_state_probs[state] == max_prob: + pred_state_seq[(i - 1, cur_state)] = state + break + + # --- termination + # calculate the probability of the state path + # loop over all states + all_probs = {} + for state in state_alphabet: + # v_{k}(L) + all_probs[state] = viterbi_probs[(state, len(sequence) - 1)] + + state_path_prob = max(all_probs.values()) + + # find the last pointer we need to trace back from + last_state = "" + for state in all_probs: + if all_probs[state] == state_path_prob: + last_state = state + + assert last_state != "", "Didn't find the last state to trace from!" + + # --- traceback + traceback_seq = [] + + loop_seq = list(range(1, len(sequence))) + loop_seq.reverse() + + # last_state is the last state in the most probable state sequence. + # Compute that sequence by walking backwards in time. From the i-th + # state in the sequence, find the (i-1)-th state as the most + # probable state preceding the i-th state. + state = last_state + traceback_seq.append(state) + for i in loop_seq: + state = pred_state_seq[(i - 1, state)] + traceback_seq.append(state) + + # put the traceback sequence in the proper orientation + traceback_seq.reverse() + traceback_seq = "".join(traceback_seq) + + return Seq(traceback_seq), state_path_prob + + def _log_transform(self, probability): + """Return log transform of the given probability dictionary (PRIVATE). + + When calculating the Viterbi equation, add logs of probabilities rather + than multiplying probabilities, to avoid underflow errors. This method + returns a new dictionary with the same keys as the given dictionary + and log-transformed values. + """ + log_prob = copy.copy(probability) + for key in log_prob: + prob = log_prob[key] + if prob > 0: + log_prob[key] = math.log(log_prob[key]) + else: + log_prob[key] = -math.inf + + return log_prob diff --git a/code/lib/Bio/HMM/Trainer.py b/code/lib/Bio/HMM/Trainer.py new file mode 100644 index 0000000..98e3703 --- /dev/null +++ b/code/lib/Bio/HMM/Trainer.py @@ -0,0 +1,430 @@ +# Copyright 2001 Brad Chapman. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Provide trainers which estimate parameters based on training sequences. + +These should be used to 'train' a Markov Model prior to actually using +it to decode state paths. 
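Before moving on to the trainers, here is an end-to-end sketch of the decoding workflow the classes above support. The fair/biased coin model and all scores are illustrative, not taken from this codebase:

```python
from Bio.Seq import Seq

builder = MarkovModelBuilder(("F", "B"), ("H", "T"))   # fair vs. biased coin
builder.set_initial_probabilities({"F": 0.8, "B": 0.2})
builder.allow_all_transitions()
for pair, p in {("F", "F"): 0.9, ("F", "B"): 0.1,
                ("B", "B"): 0.9, ("B", "F"): 0.1}.items():
    builder.set_transition_score(*pair, p)
for pair, p in {("F", "H"): 0.5, ("F", "T"): 0.5,
                ("B", "H"): 0.9, ("B", "T"): 0.1}.items():
    builder.set_emission_score(*pair, p)

model = builder.get_markov_model()
path, log_prob = model.viterbi(Seq("HTHHHHHHHH"), ("F", "B"))
print(path, log_prob)   # most probable state path and its log probability
```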
When supplied training sequences and a model +to work from, these classes will estimate parameters of the model. + +This aims to estimate two parameters: + +- a_{kl} -- the number of times there is a transition from k to l in the + training data. +- e_{k}(b) -- the number of emissions of the state b from the letter k + in the training data. + +""" +# standard modules +import math + +# local stuff +from .DynamicProgramming import ScaledDPAlgorithms + + +class TrainingSequence: + """Hold a training sequence with emissions and optionally, a state path.""" + + def __init__(self, emissions, state_path): + """Initialize a training sequence. + + Arguments: + - emissions - An iterable (e.g., a tuple, list, or Seq object) + containing the sequence of emissions in the training sequence. + - state_path - An iterable (e.g., a tuple or list) containing the + sequence of states. If there is no known state path, then the + sequence of states should be an empty iterable. + + """ + if len(state_path) > 0 and len(emissions) != len(state_path): + raise ValueError("State path does not match associated emissions.") + self.emissions = emissions + self.states = state_path + + +class AbstractTrainer: + """Provide generic functionality needed in all trainers.""" + + def __init__(self, markov_model): + """Initialize the class.""" + self._markov_model = markov_model + + def log_likelihood(self, probabilities): + """Calculate the log likelihood of the training seqs. + + Arguments: + - probabilities -- A list of the probabilities of each training + sequence under the current parameters, calculated using the + forward algorithm. + + """ + total_likelihood = 0 + for probability in probabilities: + total_likelihood += math.log(probability) + + return total_likelihood + + def estimate_params(self, transition_counts, emission_counts): + """Get a maximum likelihood estimation of transition and emmission. + + Arguments: + - transition_counts -- A dictionary with the total number of counts + of transitions between two states. + - emissions_counts -- A dictionary with the total number of counts + of emmissions of a particular emission letter by a state letter. + + This then returns the maximum likelihood estimators for the + transitions and emissions, estimated by formulas 3.18 in + Durbin et al:: + + a_{kl} = A_{kl} / sum(A_{kl'}) + e_{k}(b) = E_{k}(b) / sum(E_{k}(b')) + + Returns: + Transition and emission dictionaries containing the maximum + likelihood estimators. + + """ + # now calculate the information + ml_transitions = self.ml_estimator(transition_counts) + ml_emissions = self.ml_estimator(emission_counts) + + return ml_transitions, ml_emissions + + def ml_estimator(self, counts): + """Calculate the maximum likelihood estimator. + + This can calculate maximum likelihoods for both transitions + and emissions. + + Arguments: + - counts -- A dictionary of the counts for each item. + + See estimate_params for a description of the formula used for + calculation. + + """ + # get an ordered list of all items + all_ordered = sorted(counts) + + ml_estimation = {} + + # the total counts for the current letter we are on + cur_letter = None + cur_letter_counts = 0 + + for cur_item in all_ordered: + # if we are on a new letter (ie. 
the first letter of the tuple) + if cur_item[0] != cur_letter: + # set the new letter we are working with + cur_letter = cur_item[0] + + # count up the total counts for this letter + cur_letter_counts = counts[cur_item] + + # add counts for all other items with the same first letter + cur_position = all_ordered.index(cur_item) + 1 + + # keep adding while we have the same first letter or until + # we get to the end of the ordered list + while ( + cur_position < len(all_ordered) + and all_ordered[cur_position][0] == cur_item[0] + ): + cur_letter_counts += counts[all_ordered[cur_position]] + cur_position += 1 + # otherwise we've already got the total counts for this letter + else: + pass + + # now calculate the ml and add it to the estimation + cur_ml = float(counts[cur_item]) / float(cur_letter_counts) + ml_estimation[cur_item] = cur_ml + + return ml_estimation + + +class BaumWelchTrainer(AbstractTrainer): + """Trainer that uses the Baum-Welch algorithm to estimate parameters. + + These should be used when a training sequence for an HMM has unknown + paths for the actual states, and you need to make an estimation of the + model parameters from the observed emissions. + + This uses the Baum-Welch algorithm, first described in + Baum, L.E. 1972. Inequalities. 3:1-8 + This is based on the description in 'Biological Sequence Analysis' by + Durbin et al. in section 3.3 + + This algorithm is guaranteed to converge to a local maximum, but not + necessarily to the global maxima, so use with care! + """ + + def __init__(self, markov_model): + """Initialize the trainer. + + Arguments: + - markov_model - The model we are going to estimate parameters for. + This should have the parameters with some initial estimates, that + we can build from. + + """ + AbstractTrainer.__init__(self, markov_model) + + def train(self, training_seqs, stopping_criteria, dp_method=ScaledDPAlgorithms): + """Estimate the parameters using training sequences. + + The algorithm for this is taken from Durbin et al. p64, so this + is a good place to go for a reference on what is going on. + + Arguments: + - training_seqs -- A list of TrainingSequence objects to be used + for estimating the parameters. + - stopping_criteria -- A function, that when passed the change + in log likelihood and threshold, will indicate if we should stop + the estimation iterations. + - dp_method -- A class instance specifying the dynamic programming + implementation we should use to calculate the forward and + backward variables. By default, we use the scaling method. 
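The `stopping_criteria` argument of `train` is simply a callable that receives the change in log likelihood and the iteration count. A plausible sketch (the threshold and iteration cap are illustrative):

```python
def stop_training(log_likelihood_change, num_iterations,
                  threshold=0.01, max_iterations=100):
    """Stop when the gain is below threshold or enough rounds have run."""
    return log_likelihood_change < threshold or num_iterations >= max_iterations

# trainer = BaumWelchTrainer(model)
# trained = trainer.train(training_seqs, stop_training)
```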
+ + """ + prev_log_likelihood = None + num_iterations = 1 + + while True: + transition_count = self._markov_model.get_blank_transitions() + emission_count = self._markov_model.get_blank_emissions() + + # remember all of the sequence probabilities + all_probabilities = [] + + for training_seq in training_seqs: + # calculate the forward and backward variables + DP = dp_method(self._markov_model, training_seq) + forward_var, seq_prob = DP.forward_algorithm() + backward_var = DP.backward_algorithm() + + all_probabilities.append(seq_prob) + + # update the counts for transitions and emissions + transition_count = self.update_transitions( + transition_count, training_seq, forward_var, backward_var, seq_prob + ) + emission_count = self.update_emissions( + emission_count, training_seq, forward_var, backward_var, seq_prob + ) + + # update the markov model with the new probabilities + ml_transitions, ml_emissions = self.estimate_params( + transition_count, emission_count + ) + self._markov_model.transition_prob = ml_transitions + self._markov_model.emission_prob = ml_emissions + + cur_log_likelihood = self.log_likelihood(all_probabilities) + + # if we have previously calculated the log likelihood (ie. + # not the first round), see if we can finish + if prev_log_likelihood is not None: + # XXX log likelihoods are negatives -- am I calculating + # the change properly, or should I use the negatives... + # I'm not sure at all if this is right. + log_likelihood_change = abs( + abs(cur_log_likelihood) - abs(prev_log_likelihood) + ) + + # check whether we have completed enough iterations to have + # a good estimation + if stopping_criteria(log_likelihood_change, num_iterations): + break + + # set up for another round of iterations + prev_log_likelihood = cur_log_likelihood + num_iterations += 1 + + return self._markov_model + + def update_transitions( + self, + transition_counts, + training_seq, + forward_vars, + backward_vars, + training_seq_prob, + ): + """Add the contribution of a new training sequence to the transitions. + + Arguments: + - transition_counts -- A dictionary of the current counts for the + transitions + - training_seq -- The training sequence we are working with + - forward_vars -- Probabilities calculated using the forward + algorithm. + - backward_vars -- Probabilities calculated using the backwards + algorithm. + - training_seq_prob - The probability of the current sequence. + + This calculates A_{kl} (the estimated transition counts from state + k to state l) using formula 3.20 in Durbin et al. 
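For reference, the estimated count this method accumulates is eq. 3.20 of Durbin et al., in the same notation as the comments below:

```latex
A_{kl} \mathrel{+}= \frac{1}{P(x)} \sum_{i} f_k(i)\, a_{kl}\, e_l(x_{i+1})\, b_l(i+1)
```

Here `f` and `b` are the forward and backward variables and `P(x)` is the sequence probability returned by the forward algorithm.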
+ + """ + # set up the transition and emission probabilities we are using + transitions = self._markov_model.transition_prob + emissions = self._markov_model.emission_prob + + # loop over the possible combinations of state path letters + for k in self._markov_model.state_alphabet: + for l in self._markov_model.transitions_from(k): + estimated_counts = 0 + # now loop over the entire training sequence + for i in range(len(training_seq.emissions) - 1): + # the forward value of k at the current position + forward_value = forward_vars[(k, i)] + + # the backward value of l in the next position + backward_value = backward_vars[(l, i + 1)] + + # the probability of a transition from k to l + trans_value = transitions[(k, l)] + + # the probability of getting the emission at the next pos + emm_value = emissions[(l, training_seq.emissions[i + 1])] + + estimated_counts += ( + forward_value * trans_value * emm_value * backward_value + ) + + # update the transition approximation + transition_counts[(k, l)] += float(estimated_counts) / training_seq_prob + + return transition_counts + + def update_emissions( + self, + emission_counts, + training_seq, + forward_vars, + backward_vars, + training_seq_prob, + ): + """Add the contribution of a new training sequence to the emissions. + + Arguments: + - emission_counts -- A dictionary of the current counts for the + emissions + - training_seq -- The training sequence we are working with + - forward_vars -- Probabilities calculated using the forward + algorithm. + - backward_vars -- Probabilities calculated using the backwards + algorithm. + - training_seq_prob - The probability of the current sequence. + + This calculates E_{k}(b) (the estimated emission probability for + emission letter b from state k) using formula 3.21 in Durbin et al. + + """ + # loop over the possible combinations of state path letters + for k in self._markov_model.state_alphabet: + # now loop over all of the possible emissions + for b in self._markov_model.emission_alphabet: + expected_times = 0 + # finally loop over the entire training sequence + for i in range(len(training_seq.emissions)): + # only count the forward and backward probability if the + # emission at the position is the same as b + if training_seq.emissions[i] == b: + # f_{k}(i) b_{k}(i) + expected_times += forward_vars[(k, i)] * backward_vars[(k, i)] + + # add to E_{k}(b) + emission_counts[(k, b)] += float(expected_times) / training_seq_prob + + return emission_counts + + +class KnownStateTrainer(AbstractTrainer): + """Estimate probabilities with known state sequences. + + This should be used for direct estimation of emission and transition + probabilities when both the state path and emission sequence are + known for the training examples. + """ + + def __init__(self, markov_model): + """Initialize the class.""" + AbstractTrainer.__init__(self, markov_model) + + def train(self, training_seqs): + """Estimate the Markov Model parameters with known state paths. + + This trainer requires that both the state and the emissions are + known for all of the training sequences in the list of + TrainingSequence objects. + This training will then count all of the transitions and emissions, + and use this to estimate the parameters of the model. 
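A short sketch of this supervised training path, assuming a model built as in the earlier builder example whose allowed transitions cover those observed (the sequences are illustrative):

```python
from Bio.Seq import Seq

# Emissions and their known state path must have equal length.
seqs = [
    TrainingSequence(Seq("HHTT"), Seq("FFBB")),
    TrainingSequence(Seq("HTHT"), Seq("FFFF")),
]
trainer = KnownStateTrainer(model)
trained_model = trainer.train(seqs)
```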
+ """ + # count up all of the transitions and emissions + transition_counts = self._markov_model.get_blank_transitions() + emission_counts = self._markov_model.get_blank_emissions() + + for training_seq in training_seqs: + emission_counts = self._count_emissions(training_seq, emission_counts) + transition_counts = self._count_transitions( + training_seq.states, transition_counts + ) + + # update the markov model from the counts + ml_transitions, ml_emissions = self.estimate_params( + transition_counts, emission_counts + ) + self._markov_model.transition_prob = ml_transitions + self._markov_model.emission_prob = ml_emissions + + return self._markov_model + + def _count_emissions(self, training_seq, emission_counts): + """Add emissions from the training sequence to the current counts (PRIVATE). + + Arguments: + - training_seq -- A TrainingSequence with states and emissions + to get the counts from + - emission_counts -- The current emission counts to add to. + + """ + for index in range(len(training_seq.emissions)): + cur_state = training_seq.states[index] + cur_emission = training_seq.emissions[index] + + try: + emission_counts[(cur_state, cur_emission)] += 1 + except KeyError: + raise KeyError( + "Unexpected emission (%s, %s)" % (cur_state, cur_emission) + ) + return emission_counts + + def _count_transitions(self, state_seq, transition_counts): + """Add transitions from the training sequence to the current counts (PRIVATE). + + Arguments: + - state_seq -- A Seq object with the states of the current training + sequence. + - transition_counts -- The current transition counts to add to. + + """ + for cur_pos in range(len(state_seq) - 1): + cur_state = state_seq[cur_pos] + next_state = state_seq[cur_pos + 1] + + try: + transition_counts[(cur_state, next_state)] += 1 + except KeyError: + raise KeyError( + "Unexpected transition (%s, %s)" % (cur_state, next_state) + ) + + return transition_counts diff --git a/code/lib/Bio/HMM/Utilities.py b/code/lib/Bio/HMM/Utilities.py new file mode 100644 index 0000000..61d3b37 --- /dev/null +++ b/code/lib/Bio/HMM/Utilities.py @@ -0,0 +1,68 @@ +# Copyright 2001 Brad Chapman. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Generic functions which are useful for working with HMMs. + +This just collects general functions which you might like to use in +dealing with HMMs. +""" + + +def pretty_print_prediction( + emissions, + real_state, + predicted_state, + emission_title="Emissions", + real_title="Real State", + predicted_title="Predicted State", + line_width=75, +): + """Print out a state sequence prediction in a nice manner. + + Arguments: + - emissions -- The sequence of emissions of the sequence you are + dealing with. + - real_state -- The actual state path that generated the emissions. + - predicted_state -- A state path predicted by some kind of HMM model. 
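Plain strings work for all three sequences; a minimal illustrative call:

```python
pretty_print_prediction("HHTTTHH", "FFBBBFF", "FFBBFFF", line_width=40)
```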
+
+    """
+    # calculate the length of the titles and sequences
+    title_length = max(len(emission_title), len(real_title), len(predicted_title)) + 1
+    seq_length = line_width - title_length
+
+    # set up the titles so they'll print right
+    emission_title = emission_title.ljust(title_length)
+    real_title = real_title.ljust(title_length)
+    predicted_title = predicted_title.ljust(title_length)
+
+    cur_position = 0
+    # print the sequences seq_length characters at a time; the slices
+    # clamp automatically at the end of the sequences
+    while True:
+        print(
+            "%s%s"
+            % (emission_title, emissions[cur_position : cur_position + seq_length])
+        )
+        print(
+            "%s%s" % (real_title, real_state[cur_position : cur_position + seq_length])
+        )
+        print(
+            "%s%s\n"
+            % (
+                predicted_title,
+                predicted_state[cur_position : cur_position + seq_length],
+            )
+        )
+
+        if len(emissions) <= (cur_position + seq_length):
+            break
+
+        cur_position += seq_length
diff --git a/code/lib/Bio/HMM/__init__.py b/code/lib/Bio/HMM/__init__.py
new file mode 100644
index 0000000..a477108
--- /dev/null
+++ b/code/lib/Bio/HMM/__init__.py
@@ -0,0 +1,5 @@
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""A selection of Hidden Markov Model code."""
diff --git a/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc
new file mode 100644
index 0000000..b9f86c9
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc
new file mode 100644
index 0000000..3d527bd
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc
new file mode 100644
index 0000000..89eb7b5
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc
new file mode 100644
index 0000000..6a1bd24
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..50bf067
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/Compound/__init__.py b/code/lib/Bio/KEGG/Compound/__init__.py
new file mode 100644
index 0000000..000291a
--- /dev/null
+++ b/code/lib/Bio/KEGG/Compound/__init__.py
@@ -0,0 +1,175 @@
+# Copyright 2001 by Tarjei Mikkelsen. All rights reserved.
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to work with the KEGG Ligand/Compound database.
+
+Functions:
+ - parse - Returns an iterator giving Record objects.
+
+Classes:
+ - Record - A representation of a KEGG Ligand/Compound.
+"""
+
+
+from Bio.KEGG import _default_wrap, _struct_wrap, _wrap_kegg, _write_kegg
+
+
+# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
+name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
+id_wrap = _default_wrap
+struct_wrap = _struct_wrap
+
+
+class Record:
+    """Holds info from a KEGG Ligand/Compound record.
+
+    Attributes:
+     - entry       The entry identifier.
+     - name        A list of the compound names.
+     - formula     The chemical formula for the compound.
+     - mass        The molecular weight for the compound.
+     - pathway     A list of 3-tuples: ('PATH', pathway id, pathway)
+     - enzyme      A list of the EC numbers.
+     - structures  A list of 2-tuples: (database, list of struct ids)
+     - dblinks     A list of 2-tuples: (database, list of link ids)
+
+    """
+
+    def __init__(self):
+        """Initialize as new record."""
+        self.entry = ""
+        self.name = []
+        self.formula = ""
+        self.mass = ""
+        self.pathway = []
+        self.enzyme = []
+        self.structures = []
+        self.dblinks = []
+
+    def __str__(self):
+        """Return a string representation of this Record."""
+        return (
+            self._entry()
+            + self._name()
+            + self._formula()
+            + self._mass()
+            + self._pathway()
+            + self._enzyme()
+            + self._structures()
+            + self._dblinks()
+            + "///"
+        )
+
+    def _entry(self):
+        return _write_kegg("ENTRY", [self.entry])
+
+    def _name(self):
+        return _write_kegg(
+            "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name]
+        )
+
+    def _formula(self):
+        return _write_kegg("FORMULA", [self.formula])
+
+    def _mass(self):
+        return _write_kegg("MASS", [self.mass])
+
+    def _pathway(self):
+        s = []
+        for entry in self.pathway:
+            s.append(entry[0] + " " + entry[1])
+        return _write_kegg("PATHWAY", [_wrap_kegg(l, wrap_rule=id_wrap(16)) for l in s])
+
+    def _enzyme(self):
+        return _write_kegg(
+            "ENZYME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.enzyme]
+        )
+
+    def _structures(self):
+        s = []
+        for entry in self.structures:
+            s.append(entry[0] + ": " + " ".join(entry[1]) + " ")
+        return _write_kegg(
+            "STRUCTURES", [_wrap_kegg(l, wrap_rule=struct_wrap(5)) for l in s]
+        )
+
+    def _dblinks(self):
+        s = []
+        for entry in self.dblinks:
+            s.append(entry[0] + ": " + " ".join(entry[1]))
+        return _write_kegg("DBLINKS", [_wrap_kegg(l, wrap_rule=id_wrap(9)) for l in s])
+
+
+def parse(handle):
+    """Parse a KEGG Ligand/Compound file, returning Record objects.
+
+    This is an iterator function, typically used in a for loop. For
+    example, using one of the example KEGG files in the Biopython
+    test suite,
+
+    >>> with open("KEGG/compound.sample") as handle:
+    ...     for record in parse(handle):
+    ...         print("%s %s" % (record.entry, record.name[0]))
+    ...
+ C00023 Iron + C00017 Protein + C00099 beta-Alanine + C00294 Inosine + C00298 Trypsin + C00348 all-trans-Undecaprenyl phosphate + C00349 2-Methyl-3-oxopropanoate + C01386 NH2Mec + + """ + record = Record() + for line in handle: + if line[:3] == "///": + yield record + record = Record() + continue + if line[:12] != " ": + keyword = line[:12] + data = line[12:].strip() + if keyword == "ENTRY ": + words = data.split() + record.entry = words[0] + elif keyword == "NAME ": + data = data.strip(";") + record.name.append(data) + elif keyword == "ENZYME ": + while data: + column = data[:16] + data = data[16:] + enzyme = column.strip() + record.enzyme.append(enzyme) + elif keyword == "PATHWAY ": + map, name = data.split(" ") + pathway = ("PATH", map, name) + record.pathway.append(pathway) + elif keyword == "FORMULA ": + record.formula = data + elif keyword == "MASS ": + record.mass = data + elif keyword == "DBLINKS ": + if ":" in data: + key, values = data.split(":") + values = values.split() + row = (key, values) + record.dblinks.append(row) + else: + row = record.dblinks[-1] + key, values = row + values.extend(data.split()) + row = key, values + record.dblinks[-1] = row + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..7d9b138 Binary files /dev/null and b/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/Enzyme/__init__.py b/code/lib/Bio/KEGG/Enzyme/__init__.py new file mode 100644 index 0000000..bb5bb7c --- /dev/null +++ b/code/lib/Bio/KEGG/Enzyme/__init__.py @@ -0,0 +1,328 @@ +# Copyright 2001 by Tarjei Mikkelsen. All rights reserved. +# Copyright 2007 by Michiel de Hoon. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to work with the KEGG Enzyme database. + +Functions: + - parse - Returns an iterator giving Record objects. + +Classes: + - Record - Holds the information from a KEGG Enzyme record. +""" + + +from Bio.KEGG import _default_wrap, _struct_wrap, _wrap_kegg, _write_kegg + + +# Set up line wrapping rules (see Bio.KEGG._wrap_kegg) +rxn_wrap = [ + 0, + "", + (" + ", "", 1, 1), + (" = ", "", 1, 1), + (" ", "$", 1, 1), + ("-", "$", 1, 1), +] +name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)] +id_wrap = _default_wrap +struct_wrap = _struct_wrap + + +class Record: + """Holds info from a KEGG Enzyme record. + + Attributes: + - entry The EC number (withou the 'EC '). + - name A list of the enzyme names. + - classname A list of the classification terms. + - sysname The systematic name of the enzyme. + - reaction A list of the reaction description strings. + - substrate A list of the substrates. + - product A list of the products. + - inhibitor A list of the inhibitors. + - cofactor A list of the cofactors. + - effector A list of the effectors. + - comment A list of the comment strings. 
+ - pathway A list of 3-tuples: (database, id, pathway) + - genes A list of 2-tuples: (organism, list of gene ids) + - disease A list of 3-tuples: (database, id, disease) + - structures A list of 2-tuples: (database, list of struct ids) + - dblinks A list of 2-tuples: (database, list of db ids) + + """ + + def __init__(self): + """Initialize a new Record.""" + self.entry = "" + self.name = [] + self.classname = [] + self.sysname = [] + self.reaction = [] + self.substrate = [] + self.product = [] + self.inhibitor = [] + self.cofactor = [] + self.effector = [] + self.comment = [] + self.pathway = [] + self.genes = [] + self.disease = [] + self.structures = [] + self.dblinks = [] + + def __str__(self): + """Return a string representation of this Record.""" + return ( + self._entry() + + self._name() + + self._classname() + + self._sysname() + + self._reaction() + + self._substrate() + + self._product() + + self._inhibitor() + + self._cofactor() + + self._effector() + + self._comment() + + self._pathway() + + self._genes() + + self._disease() + + self._structures() + + self._dblinks() + + "///" + ) + + def _entry(self): + return _write_kegg("ENTRY", ["EC " + self.entry]) + + def _name(self): + return _write_kegg( + "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name] + ) + + def _classname(self): + return _write_kegg("CLASS", self.classname) + + def _sysname(self): + return _write_kegg( + "SYSNAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.sysname] + ) + + def _reaction(self): + return _write_kegg( + "REACTION", [_wrap_kegg(l, wrap_rule=rxn_wrap) for l in self.reaction] + ) + + def _substrate(self): + return _write_kegg( + "SUBSTRATE", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.substrate] + ) + + def _product(self): + return _write_kegg( + "PRODUCT", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.product] + ) + + def _inhibitor(self): + return _write_kegg( + "INHIBITOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.inhibitor] + ) + + def _cofactor(self): + return _write_kegg( + "COFACTOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.cofactor] + ) + + def _effector(self): + return _write_kegg( + "EFFECTOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.effector] + ) + + def _comment(self): + return _write_kegg( + "COMMENT", [_wrap_kegg(l, wrap_rule=id_wrap(0)) for l in self.comment] + ) + + def _pathway(self): + s = [] + for entry in self.pathway: + s.append(entry[0] + ": " + entry[1] + " " + entry[2]) + return _write_kegg("PATHWAY", [_wrap_kegg(l, wrap_rule=id_wrap(16)) for l in s]) + + def _genes(self): + s = [] + for entry in self.genes: + s.append(entry[0] + ": " + " ".join(entry[1])) + return _write_kegg("GENES", [_wrap_kegg(l, wrap_rule=id_wrap(5)) for l in s]) + + def _disease(self): + s = [] + for entry in self.disease: + s.append(entry[0] + ": " + entry[1] + " " + entry[2]) + return _write_kegg("DISEASE", [_wrap_kegg(l, wrap_rule=id_wrap(13)) for l in s]) + + def _structures(self): + s = [] + for entry in self.structures: + s.append(entry[0] + ": " + " ".join(entry[1]) + " ") + return _write_kegg( + "STRUCTURES", [_wrap_kegg(l, wrap_rule=struct_wrap(5)) for l in s] + ) + + def _dblinks(self): + # This is a bit of a cheat that won't work if enzyme entries + # have more than one link id per db id. For now, that's not + # the case - storing links ids in a list is only to make + # this class similar to the Compound.Record class. 
+ s = [] + for entry in self.dblinks: + s.append(entry[0] + ": " + " ".join(entry[1])) + return _write_kegg("DBLINKS", s) + + +def parse(handle): + """Parse a KEGG Enzyme file, returning Record objects. + + This is an iterator function, typically used in a for loop. For + example, using one of the example KEGG files in the Biopython + test suite, + + >>> with open("KEGG/enzyme.sample") as handle: + ... for record in parse(handle): + ... print("%s %s" % (record.entry, record.name[0])) + ... + 1.1.1.1 alcohol dehydrogenase + 1.1.1.62 17beta-estradiol 17-dehydrogenase + 1.1.1.68 Transferred to 1.5.1.20 + 1.6.5.3 NADH:ubiquinone reductase (H+-translocating) + 1.14.13.28 3,9-dihydroxypterocarpan 6a-monooxygenase + 2.4.1.68 glycoprotein 6-alpha-L-fucosyltransferase + 3.1.1.6 acetylesterase + 2.7.2.1 acetate kinase + + """ + record = Record() + for line in handle: + if line[:3] == "///": + yield record + record = Record() + continue + if line[:12] != " ": + keyword = line[:12] + data = line[12:].strip() + if keyword == "ENTRY ": + words = data.split() + record.entry = words[1] + elif keyword == "CLASS ": + record.classname.append(data) + elif keyword == "COFACTOR ": + record.cofactor.append(data) + elif keyword == "COMMENT ": + record.comment.append(data) + elif keyword == "DBLINKS ": + if ":" in data: + key, values = data.split(":") + values = values.split() + row = (key, values) + record.dblinks.append(row) + else: + row = record.dblinks[-1] + key, values = row + values.extend(data.split()) + row = key, values + record.dblinks[-1] = row + elif keyword == "DISEASE ": + if ":" in data: + database, data = data.split(":") + number, name = data.split(None, 1) + row = (database, number, name) + record.disease.append(row) + else: + row = record.disease[-1] + database, number, name = row + name = name + " " + data + row = database, number, name + record.disease[-1] = row + elif keyword == "EFFECTOR ": + record.effector.append(data.strip(";")) + elif keyword == "GENES ": + if data[3:5] == ": " or data[4:6] == ": ": + key, values = data.split(":", 1) + values = [value.split("(")[0] for value in values.split()] + row = (key, values) + record.genes.append(row) + else: + row = record.genes[-1] + key, values = row + for value in data.split(): + value = value.split("(")[0] + values.append(value) + row = key, values + record.genes[-1] = row + elif keyword == "INHIBITOR ": + record.inhibitor.append(data.strip(";")) + elif keyword == "NAME ": + record.name.append(data.strip(";")) + elif keyword == "PATHWAY ": + if data[:5] == "PATH:": + _, map_num, name = data.split(None, 2) + pathway = ("PATH", map_num, name) + record.pathway.append(pathway) + else: + ec_num, name = data.split(None, 1) + pathway = "PATH", ec_num, name + record.pathway.append(pathway) + elif keyword == "PRODUCT ": + record.product.append(data.strip(";")) + elif keyword == "REACTION ": + record.reaction.append(data.strip(";")) + elif keyword == "STRUCTURES ": + if data[:4] == "PDB:": + database = data[:3] + accessions = data[4:].split() + row = (database, accessions) + record.structures.append(row) + else: + row = record.structures[-1] + database, accessions = row + accessions.extend(data.split()) + row = (database, accessions) + record.structures[-1] = row + elif keyword == "SUBSTRATE ": + record.substrate.append(data.strip(";")) + elif keyword == "SYSNAME ": + record.sysname.append(data.strip(";")) + + +def read(handle): + """Parse a KEGG Enzyme file with exactly one entry. 
+ + If the handle contains no records, or more than one record, + an exception is raised. For example: + + >>> with open("KEGG/enzyme.new") as handle: + ... record = read(handle) + ... print("%s %s" % (record.entry, record.name[0])) + ... + 6.2.1.25 benzoate---CoA ligase + """ + records = parse(handle) + try: + record = next(records) + except StopIteration: + raise ValueError("No records found in handle") from None + try: + next(records) + raise ValueError("More than one record found in handle") + except StopIteration: + pass + return record + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..b6c2c5f Binary files /dev/null and b/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/Gene/__init__.py b/code/lib/Bio/KEGG/Gene/__init__.py new file mode 100644 index 0000000..8ffe5c2 --- /dev/null +++ b/code/lib/Bio/KEGG/Gene/__init__.py @@ -0,0 +1,140 @@ +# Copyright 2017 by Kozo Nishida. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Code to work with the KEGG Gene database. + +Functions: +- parse - Returns an iterator giving Record objects. + +Classes: +- Record - A representation of a KEGG Gene. + +""" + + +from Bio.KEGG import _default_wrap, _wrap_kegg, _write_kegg + + +# Set up line wrapping rules (see Bio.KEGG._wrap_kegg) +name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)] +id_wrap = _default_wrap + + +class Record: + """Holds info from a KEGG Gene record. + + Attributes: + - entry The entry identifier. + - name A list of the gene names. + - definition The definition for the gene. + - orthology A list of 2-tuples: (orthology id, role) + - organism A tuple: (organism id, organism) + - position The position for the gene + - motif A list of 2-tuples: (database, list of link ids) + - dblinks A list of 2-tuples: (database, list of link ids) + + """ + + def __init__(self): + """Initialize new record.""" + self.entry = "" + self.name = [] + self.definition = "" + self.orthology = [] + self.organism = "" + self.position = "" + self.motif = [] + self.dblinks = [] + + def __str__(self): + """Return a string representation of this Record.""" + return self._entry() + self._name() + self._dblinks() + "///" + + def _entry(self): + return _write_kegg("ENTRY", [self.entry]) + + def _name(self): + return _write_kegg( + "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name] + ) + + def _definition(self): + return _write_kegg("DEFINITION", [self.definition]) + + def _dblinks(self): + s = [] + for entry in self.dblinks: + s.append(entry[0] + ": " + " ".join(entry[1])) + return _write_kegg("DBLINKS", [_wrap_kegg(l, wrap_rule=id_wrap(9)) for l in s]) + + +def parse(handle): + """Parse a KEGG Gene file, returning Record objects. + + This is an iterator function, typically used in a for loop. For + example, using one of the example KEGG files in the Biopython + test suite, + + >>> with open("KEGG/gene.sample") as handle: + ... for record in parse(handle): + ... print("%s %s" % (record.entry, record.name[0])) + ... 
+ b1174 minE + b1175 minD + + + """ + record = Record() + for line in handle: + if line[:3] == "///": + yield record + record = Record() + continue + if line[:12] != " ": + keyword = line[:12] + data = line[12:].strip() + if keyword == "ENTRY ": + words = data.split() + record.entry = words[0] + elif keyword == "NAME ": + data = data.strip(";") + record.name.append(data) + elif keyword == "DEFINITION ": + record.definition = data + elif keyword == "ORTHOLOGY ": + id, name = data.split(" ") + orthology = (id, name) + record.orthology.append(orthology) + elif keyword == "ORGANISM ": + id, name = data.split(" ") + organism = (id, name) + record.organism = organism + elif keyword == "POSITION ": + record.position = data + elif keyword == "MOTIF ": + key, values = data.split(": ") + values = values.split() + row = (key, values) + record.motif.append(row) + elif keyword == "DBLINKS ": + if ":" in data: + key, values = data.split(": ") + values = values.split() + row = (key, values) + record.dblinks.append(row) + else: + row = record.dblinks[-1] + key, values = row + values.extend(data.split()) + row = key, values + record.dblinks[-1] = row + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..efa0935 Binary files /dev/null and b/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/KGML/KGML_parser.py b/code/lib/Bio/KEGG/KGML/KGML_parser.py new file mode 100644 index 0000000..6405ce3 --- /dev/null +++ b/code/lib/Bio/KEGG/KGML/KGML_parser.py @@ -0,0 +1,189 @@ +# Copyright 2013 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Classes and functions to parse a KGML pathway map. + +The KGML pathway map is parsed into the object structure defined in +KGML_Pathway.py in this module. + +Classes: + - KGMLParser - Parses KGML file + +Functions: + - read - Returns a single Pathway object, using KGMLParser internally + +""" + +from xml.etree import ElementTree + +from io import StringIO + +from Bio.KEGG.KGML.KGML_pathway import Component, Entry, Graphics +from Bio.KEGG.KGML.KGML_pathway import Pathway, Reaction, Relation + + +def read(handle): + """Parse a single KEGG Pathway from given file handle. + + Returns a single Pathway object. There should be one and only + one pathway in each file, but there may well be pathological + examples out there. + """ + pathways = parse(handle) + try: + pathway = next(pathways) + except StopIteration: + raise ValueError("No pathways found in handle") from None + try: + next(pathways) + raise ValueError("More than one pathway found in handle") + except StopIteration: + pass + return pathway + + +def parse(handle): + """Return an iterator over Pathway elements. + + Arguments: + - handle - file handle to a KGML file for parsing, or a KGML string + + This is a generator for the return of multiple Pathway objects. 
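A usage sketch for `read` and `parse` (the file names are illustrative):

```python
from Bio.KEGG.KGML.KGML_parser import read, parse

# read() expects exactly one pathway in the source:
with open("ko00010.xml") as handle:
    pathway = read(handle)
print(pathway.title)

# parse() also accepts a KGML string and can yield several pathways:
with open("ko_maps.xml") as handle:
    for pathway in parse(handle):
        print(pathway.name, len(pathway.entries))
```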
+ + """ + # Check handle + try: + handle.read(0) + except AttributeError: + try: + handle = StringIO(handle) + except TypeError: + raise TypeError( + "An XML-containing handle or an XML string must be provided" + ) from None + # Parse XML and return each Pathway + for event, elem in ElementTree.iterparse(handle, events=("start", "end")): + if event == "end" and elem.tag == "pathway": + yield KGMLParser(elem).parse() + elem.clear() + + +class KGMLParser: + """Parses a KGML XML Pathway entry into a Pathway object. + + Example: Read and parse large metabolism file + + >>> from Bio.KEGG.KGML.KGML_parser import read + >>> pathway = read(open('KEGG/ko01100.xml', 'r')) + >>> print(len(pathway.entries)) + 3628 + >>> print(len(pathway.reactions)) + 1672 + >>> print(len(pathway.maps)) + 149 + + >>> pathway = read(open('KEGG/ko00010.xml', 'r')) + >>> print(pathway) #doctest: +NORMALIZE_WHITESPACE + Pathway: Glycolysis / Gluconeogenesis + KEGG ID: path:ko00010 + Image file: http://www.kegg.jp/kegg/pathway/ko/ko00010.png + Organism: ko + Entries: 99 + Entry types: + ortholog: 61 + compound: 31 + map: 7 + + """ + + def __init__(self, elem): + """Initialize the class.""" + self.entry = elem + + def parse(self): + """Parse the input elements.""" + + def _parse_pathway(attrib): + for k, v in attrib.items(): + self.pathway.__setattr__(k, v) + + def _parse_entry(element): + new_entry = Entry() + for k, v in element.attrib.items(): + new_entry.__setattr__(k, v) + for subelement in element: + if subelement.tag == "graphics": + _parse_graphics(subelement, new_entry) + elif subelement.tag == "component": + _parse_component(subelement, new_entry) + self.pathway.add_entry(new_entry) + + def _parse_graphics(element, entry): + new_graphics = Graphics(entry) + for k, v in element.attrib.items(): + new_graphics.__setattr__(k, v) + entry.add_graphics(new_graphics) + + def _parse_component(element, entry): + new_component = Component(entry) + for k, v in element.attrib.items(): + new_component.__setattr__(k, v) + entry.add_component(new_component) + + def _parse_reaction(element): + new_reaction = Reaction() + for k, v in element.attrib.items(): + new_reaction.__setattr__(k, v) + for subelement in element: + if subelement.tag == "substrate": + new_reaction.add_substrate(int(subelement.attrib["id"])) + elif subelement.tag == "product": + new_reaction.add_product(int(subelement.attrib["id"])) + self.pathway.add_reaction(new_reaction) + + def _parse_relation(element): + new_relation = Relation() + new_relation.entry1 = int(element.attrib["entry1"]) + new_relation.entry2 = int(element.attrib["entry2"]) + new_relation.type = element.attrib["type"] + for subtype in element: + name, value = subtype.attrib["name"], subtype.attrib["value"] + if name in ("compound", "hidden compound"): + new_relation.subtypes.append((name, int(value))) + else: + new_relation.subtypes.append((name, value)) + self.pathway.add_relation(new_relation) + + # ========== + # Initialize Pathway + self.pathway = Pathway() + # Get information about the pathway itself + _parse_pathway(self.entry.attrib) + for element in self.entry: + if element.tag == "entry": + _parse_entry(element) + elif element.tag == "reaction": + _parse_reaction(element) + elif element.tag == "relation": + _parse_relation(element) + # Parsing of some elements not implemented - no examples yet + else: + # This should warn us of any unimplemented tags + import warnings + from Bio import BiopythonParserWarning + + warnings.warn( + "Warning: tag %s not implemented in parser" % element.tag, 
+ BiopythonParserWarning, + ) + return self.pathway + + +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest(verbose=0) diff --git a/code/lib/Bio/KEGG/KGML/KGML_pathway.py b/code/lib/Bio/KEGG/KGML/KGML_pathway.py new file mode 100644 index 0000000..12dd8aa --- /dev/null +++ b/code/lib/Bio/KEGG/KGML/KGML_pathway.py @@ -0,0 +1,859 @@ +# Copyright 2013 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Classes to represent a KGML Pathway Map. + +The KGML definition is as of release KGML v0.7.2 +(http://www.kegg.jp/kegg/xml/docs/) + +Classes: + - Pathway - Specifies graph information for the pathway map + - Relation - Specifies a relationship between two proteins or KOs, + or protein and compound. There is an implied direction to the + relationship in some cases. + - Reaction - A specific chemical reaction between a substrate and + a product. + - Entry - A node in the pathway graph + - Graphics - Entry subelement describing its visual representation + +""" + +import time +from itertools import chain +from xml.dom import minidom +import xml.etree.ElementTree as ET + + +# Pathway +class Pathway: + """Represents a KGML pathway from KEGG. + + Specifies graph information for the pathway map, as described in + release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/) + + Attributes: + - name - KEGGID of the pathway map + - org - ko/ec/[org prefix] + - number - map number (integer) + - title - the map title + - image - URL of the image map for the pathway + - link - URL of information about the pathway + - entries - Dictionary of entries in the pathway, keyed by node ID + - reactions - Set of reactions in the pathway + + The name attribute has a restricted format, so we make it a property and + enforce the formatting. + + The Pathway object is the only allowed route for adding/removing + Entry, Reaction, or Relation elements. + + Entries are held in a dictionary and keyed by the node ID for the + pathway graph - this allows for ready access via the Reaction/Relation + etc. elements. Entries must be added before reference by any other + element. + + Reactions are held in a dictionary, keyed by node ID for the path. + The elements referred to in the reaction must be added before the + reaction itself. 
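The ordering contract described above matters in practice: entries must be registered before anything that refers to them. A sketch, assuming `Entry` and `Reaction` accept integer node IDs (which the `add_*` checks below enforce; the IDs here are illustrative):

```python
from Bio.KEGG.KGML.KGML_pathway import Entry, Pathway, Reaction

pathway = Pathway()

entry = Entry()
entry.id = 1                    # integer node ID, as add_entry() requires
pathway.add_entry(entry)        # the entry must be registered first...

reaction = Reaction()
reaction.id = 1
pathway.add_reaction(reaction)  # ...before a reaction can reference its ID
```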
+ + """ + + def __init__(self): + """Initialize the class.""" + self._name = "" + self.org = "" + self._number = None + self.title = "" + self.image = "" + self.link = "" + self.entries = {} + self._reactions = {} + self._relations = set() + + def get_KGML(self): + """Return the pathway as a string in prettified KGML format.""" + header = "\n".join( + [ + '', + "', + "" % time.asctime(), + ] + ) + rough_xml = header + ET.tostring(self.element, "utf-8").decode() + reparsed = minidom.parseString(rough_xml) + return reparsed.toprettyxml(indent=" ") + + def add_entry(self, entry): + """Add an Entry element to the pathway.""" + # We insist that the node ID is an integer + if not isinstance(entry.id, int): + raise TypeError( + "Node ID must be an integer, got %s (%s)" % (type(entry.id), entry.id) + ) + entry._pathway = self # Let the entry know about the pathway + self.entries[entry.id] = entry + + def remove_entry(self, entry): + """Remove an Entry element from the pathway.""" + if not isinstance(entry.id, int): + raise TypeError( + "Node ID must be an integer, got %s (%s)" % (type(entry.id), entry.id) + ) + # We need to remove the entry from any other elements that may + # contain it, which means removing those elements + # TODO + del self.entries[entry.id] + + def add_reaction(self, reaction): + """Add a Reaction element to the pathway.""" + # We insist that the node ID is an integer and corresponds to an entry + if not isinstance(reaction.id, int): + raise ValueError( + "Node ID must be an integer, got %s (%s)" + % (type(reaction.id), reaction.id) + ) + if reaction.id not in self.entries: + raise ValueError("Reaction ID %d has no corresponding entry" % reaction.id) + reaction._pathway = self # Let the reaction know about the pathway + self._reactions[reaction.id] = reaction + + def remove_reaction(self, reaction): + """Remove a Reaction element from the pathway.""" + if not isinstance(reaction.id, int): + raise TypeError( + "Node ID must be an integer, got %s (%s)" + % (type(reaction.id), reaction.id) + ) + # We need to remove the reaction from any other elements that may + # contain it, which means removing those elements + # TODO + del self._reactions[reaction.id] + + def add_relation(self, relation): + """Add a Relation element to the pathway.""" + relation._pathway = self # Let the reaction know about the pathway + self._relations.add(relation) + + def remove_relation(self, relation): + """Remove a Relation element from the pathway.""" + self._relations.remove(relation) + + def __str__(self): + """Return a readable summary description string.""" + outstr = [ + "Pathway: %s" % self.title, + "KEGG ID: %s" % self.name, + "Image file: %s" % self.image, + "Organism: %s" % self.org, + "Entries: %d" % len(self.entries), + "Entry types:", + ] + for t in ["ortholog", "enzyme", "reaction", "gene", "group", "compound", "map"]: + etype = [e for e in self.entries.values() if e.type == t] + if len(etype): + outstr.append("\t%s: %d" % (t, len(etype))) + return "\n".join(outstr) + "\n" + + # Assert correct formatting of the pathway name, and other attributes + def _getname(self): + return self._name + + def _setname(self, value): + if not value.startswith("path:"): + raise ValueError("Pathway name should begin with 'path:', got %s" % value) + self._name = value + + def _delname(self): + del self._name + + name = property(_getname, _setname, _delname, "The KEGGID for the pathway map.") + + def _getnumber(self): + return self._number + + def _setnumber(self, value): + self._number = int(value) + + def 
_delnumber(self): + del self._number + + number = property(_getnumber, _setnumber, _delnumber, "The KEGG map number.") + + @property + def compounds(self): + """Get a list of entries of type compound.""" + return [e for e in self.entries.values() if e.type == "compound"] + + @property + def maps(self): + """Get a list of entries of type map.""" + return [e for e in self.entries.values() if e.type == "map"] + + @property + def orthologs(self): + """Get a list of entries of type ortholog.""" + return [e for e in self.entries.values() if e.type == "ortholog"] + + @property + def genes(self): + """Get a list of entries of type gene.""" + return [e for e in self.entries.values() if e.type == "gene"] + + @property + def reactions(self): + """Get a list of reactions in the pathway.""" + return self._reactions.values() + + @property + def reaction_entries(self): + """List of entries corresponding to each reaction in the pathway.""" + return [self.entries[i] for i in self._reactions] + + @property + def relations(self): + """Get a list of relations in the pathway.""" + return list(self._relations) + + @property + def element(self): + """Return the Pathway as a valid KGML element.""" + # The root is this Pathway element + pathway = ET.Element("pathway") + pathway.attrib = { + "name": self._name, + "org": self.org, + "number": str(self._number), + "title": self.title, + "image": self.image, + "link": self.link, + } + # We add the Entries in node ID order + for eid, entry in sorted(self.entries.items()): + pathway.append(entry.element) + # Next we add Relations + for relation in self._relations: + pathway.append(relation.element) + for eid, reaction in sorted(self._reactions.items()): + pathway.append(reaction.element) + return pathway + + @property + def bounds(self): + """Coordinate bounds for all Graphics elements in the Pathway. + + Returns the [(xmin, ymin), (xmax, ymax)] coordinates for all + Graphics elements in the Pathway + """ + xlist, ylist = [], [] + for b in [g.bounds for g in self.entries.values()]: + xlist.extend([b[0][0], b[1][0]]) + ylist.extend([b[0][1], b[1][1]]) + return [(min(xlist), min(ylist)), (max(xlist), max(ylist))] + + +# Entry +class Entry: + """Represent an Entry from KGML. 
+ + Each Entry element is a node in the pathway graph, as described in + release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/) + + Attributes: + - id - The ID of the entry in the pathway map (integer) + - names - List of KEGG IDs for the entry + - type - The type of the entry + - link - URL of information about the entry + - reaction - List of KEGG IDs of the corresponding reactions + (integer) + - graphics - List of Graphics objects describing the Entry's visual + representation + - components - List of component node ID for this Entry ('group') + - alt - List of alternate names for the Entry + + NOTE: The alt attribute represents a subelement of the substrate and + product elements in the KGML file + + """ + + def __init__(self): + """Initialize the class.""" + self._id = None + self._names = [] + self.type = "" + self.image = "" + self.link = "" + self.graphics = [] + self.components = set() + self.alt = [] + self._pathway = None + self._reactions = [] + + def __str__(self): + """Return readable descriptive string.""" + outstr = [ + "Entry node ID: %d" % self.id, + "Names: %s" % self.name, + "Type: %s" % self.type, + "Components: %s" % self.components, + "Reactions: %s" % self.reaction, + "Graphics elements: %d %s" % (len(self.graphics), self.graphics), + ] + return "\n".join(outstr) + "\n" + + def add_component(self, element): + """Add an element to the entry. + + If the Entry is already part of a pathway, make sure + the component already exists. + """ + if self._pathway is not None: + if element.id not in self._pathway.entries: + raise ValueError( + "Component %s is not an entry in the pathway" % element.id + ) + self.components.add(element) + + def remove_component(self, value): + """Remove the entry with the passed ID from the group.""" + self.components.remove(value) + + def add_graphics(self, entry): + """Add the Graphics entry.""" + self.graphics.append(entry) + + def remove_graphics(self, entry): + """Remove the Graphics entry with the passed ID from the group.""" + self.graphics.remove(entry) + + # Names may be given as a space-separated list of KEGG identifiers + def _getname(self): + return " ".join(self._names) + + def _setname(self, value): + self._names = value.split() + + def _delname(self): + self._names = [] + + name = property( + _getname, _setname, _delname, "List of KEGG identifiers for the Entry." 
+ ) + + # Reactions may be given as a space-separated list of KEGG identifiers + def _getreaction(self): + return " ".join(self._reactions) + + def _setreaction(self, value): + self._reactions = value.split() + + def _delreaction(self): + self._reactions = [] + + reaction = property( + _getreaction, + _setreaction, + _delreaction, + "List of reaction KEGG IDs for this Entry.", + ) + + # We make sure that the node ID is an integer + def _getid(self): + return self._id + + def _setid(self, value): + self._id = int(value) + + def _delid(self): + del self._id + + id = property(_getid, _setid, _delid, "The pathway graph node ID for the Entry.") + + @property + def element(self): + """Return the Entry as a valid KGML element.""" + # The root is this Entry element + entry = ET.Element("entry") + entry.attrib = { + "id": str(self._id), + "name": self.name, + "link": self.link, + "type": self.type, + } + if len(self._reactions): + entry.attrib["reaction"] = self.reaction + if len(self.graphics): + for g in self.graphics: + entry.append(g.element) + if len(self.components): + for c in self.components: + entry.append(c.element) + return entry + + @property + def bounds(self): + """Coordinate bounds for all Graphics elements in the Entry. + + Return the [(xmin, ymin), (xmax, ymax)] co-ordinates for the Entry + Graphics elements. + """ + xlist, ylist = [], [] + for b in [g.bounds for g in self.graphics]: + xlist.extend([b[0][0], b[1][0]]) + ylist.extend([b[0][1], b[1][1]]) + return [(min(xlist), min(ylist)), (max(xlist), max(ylist))] + + @property + def is_reactant(self): + """Return true if this Entry participates in any reaction in its parent pathway.""" + for rxn in self._pathway.reactions: + if self._id in rxn.reactant_ids: + return True + return False + + +# Component +class Component: + """An Entry subelement used to represents a complex node. + + A subelement of the Entry element, used when the Entry is a complex + node, as described in release KGML v0.7.2 + (http://www.kegg.jp/kegg/xml/docs/) + + The Component acts as a collection (with type 'group', and typically + its own Graphics subelement), having only an ID. + """ + + def __init__(self, parent): + """Initialize the class.""" + self._id = None + self._parent = parent + + # We make sure that the node ID is an integer + def _getid(self): + return self._id + + def _setid(self, value): + self._id = int(value) + + def _delid(self): + del self._id + + id = property(_getid, _setid, _delid, "The pathway graph node ID for the Entry") + + @property + def element(self): + """Return the Component as a valid KGML element.""" + # The root is this Component element + component = ET.Element("component") + component.attrib = {"id": str(self._id)} + return component + + +# Graphics +class Graphics: + """An Entry subelement used to represents the visual representation. + + A subelement of Entry, specifying its visual representation, as + described in release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/) + + Attributes: + - name Label for the graphics object + - x X-axis position of the object (int) + - y Y-axis position of the object (int) + - coords polyline co-ordinates, list of (int, int) tuples + - type object shape + - width object width (int) + - height object height (int) + - fgcolor object foreground color (hex RGB) + - bgcolor object background color (hex RGB) + + Some attributes are present only for specific graphics types. For + example, line types do not (typically) have a width. 
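+
+    For example (a sketch; ``parent_entry`` stands in for an existing Entry
+    object, and the string values mimic raw KGML attribute values, which
+    the property setters below coerce to numeric types):
+
+        g = Graphics(parent_entry)
+        g.x, g.y = "100", "50"           # stored as floats 100.0 and 50.0
+        g.width, g.height = "46", "17"
+        g.coords = "10,20,30,40"         # parsed to [(10, 20), (30, 40)]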
+ We permit non-DTD attributes and attribute settings, such as + + dash List of ints, describing an on/off pattern for dashes + + """ + + def __init__(self, parent): + """Initialize the class.""" + self.name = "" + self._x = None + self._y = None + self._coords = None + self.type = "" + self._width = None + self._height = None + self.fgcolor = "" + self.bgcolor = "" + self._parent = parent + + # We make sure that the XY coordinates, width and height are numbers + def _getx(self): + return self._x + + def _setx(self, value): + self._x = float(value) + + def _delx(self): + del self._x + + x = property(_getx, _setx, _delx, "The X coordinate for the graphics element.") + + def _gety(self): + return self._y + + def _sety(self, value): + self._y = float(value) + + def _dely(self): + del self._y + + y = property(_gety, _sety, _dely, "The Y coordinate for the graphics element.") + + def _getwidth(self): + return self._width + + def _setwidth(self, value): + self._width = float(value) + + def _delwidth(self): + del self._width + + width = property( + _getwidth, _setwidth, _delwidth, "The width of the graphics element." + ) + + def _getheight(self): + return self._height + + def _setheight(self, value): + self._height = float(value) + + def _delheight(self): + del self._height + + height = property( + _getheight, _setheight, _delheight, "The height of the graphics element." + ) + + # We make sure that the polyline co-ordinates are integers, too + def _getcoords(self): + return self._coords + + def _setcoords(self, value): + clist = [int(e) for e in value.split(",")] + self._coords = [tuple(clist[i : i + 2]) for i in range(0, len(clist), 2)] + + def _delcoords(self): + del self._coords + + coords = property( + _getcoords, + _setcoords, + _delcoords, + "Polyline coordinates for the graphics element.", + ) + + # Set default colors + def _getfgcolor(self): + return self._fgcolor + + def _setfgcolor(self, value): + if value == "none": + self._fgcolor = "#000000" # this default defined in KGML spec + else: + self._fgcolor = value + + def _delfgcolor(self): + del self._fgcolor + + fgcolor = property(_getfgcolor, _setfgcolor, _delfgcolor, "Foreground color.") + + def _getbgcolor(self): + return self._bgcolor + + def _setbgcolor(self, value): + if value == "none": + self._bgcolor = "#000000" # this default defined in KGML spec + else: + self._bgcolor = value + + def _delbgcolor(self): + del self._bgcolor + + bgcolor = property(_getbgcolor, _setbgcolor, _delbgcolor, "Background color.") + + @property + def element(self): + """Return the Graphics as a valid KGML element.""" + # The root is this Component element + graphics = ET.Element("graphics") + if isinstance(self.fgcolor, str): # Assumes that string is hexstring + fghex = self.fgcolor + else: # Assumes ReportLab Color object + fghex = "#" + self.fgcolor.hexval()[2:] + if isinstance(self.bgcolor, str): # Assumes that string is hexstring + bghex = self.bgcolor + else: # Assumes ReportLab Color object + bghex = "#" + self.bgcolor.hexval()[2:] + graphics.attrib = { + "name": self.name, + "type": self.type, + "fgcolor": fghex, + "bgcolor": bghex, + } + for (n, attr) in [ + ("x", "_x"), + ("y", "_y"), + ("width", "_width"), + ("height", "_height"), + ]: + if getattr(self, attr) is not None: + graphics.attrib[n] = str(getattr(self, attr)) + if self.type == "line": # Need to write polycoords + graphics.attrib["coords"] = ",".join( + [str(e) for e in chain.from_iterable(self.coords)] + ) + return graphics + + @property + def bounds(self): + """Coordinate bounds 
for the Graphics element. + + Return the bounds of the Graphics object as an [(xmin, ymin), + (xmax, ymax)] tuple. Co-ordinates give the centre of the + circle, rectangle, roundrectangle elements, so we have to + adjust for the relevant width/height. + """ + if self.type == "line": + xlist = [x for x, y in self.coords] + ylist = [y for x, y in self.coords] + return [(min(xlist), min(ylist)), (max(xlist), max(ylist))] + else: + return [ + (self.x - self.width * 0.5, self.y - self.height * 0.5), + (self.x + self.width * 0.5, self.y + self.height * 0.5), + ] + + @property + def centre(self): + """Return the centre of the Graphics object as an (x, y) tuple.""" + return ( + 0.5 * (self.bounds[0][0] + self.bounds[1][0]), + 0.5 * (self.bounds[0][1] + self.bounds[1][1]), + ) + + +# Reaction +class Reaction: + """A specific chemical reaction with substrates and products. + + This describes a specific chemical reaction between one or more + substrates and one or more products. + + Attributes: + - id Pathway graph node ID of the entry + - names List of KEGG identifier(s) from the REACTION database + - type String: reversible or irreversible + - substrate Entry object of the substrate + - product Entry object of the product + + """ + + def __init__(self): + """Initialize the class.""" + self._id = None + self._names = [] + self.type = "" + self._substrates = set() + self._products = set() + self._pathway = None + + def __str__(self): + """Return an informative human-readable string.""" + outstr = [ + "Reaction node ID: %s" % self.id, + "Reaction KEGG IDs: %s" % self.name, + "Type: %s" % self.type, + "Substrates: %s" % ",".join([s.name for s in self.substrates]), + "Products: %s" % ",".join([s.name for s in self.products]), + ] + return "\n".join(outstr) + "\n" + + def add_substrate(self, substrate_id): + """Add a substrate, identified by its node ID, to the reaction.""" + if self._pathway is not None: + if int(substrate_id) not in self._pathway.entries: + raise ValueError( + "Couldn't add substrate, no node ID %d in Pathway" + % int(substrate_id) + ) + self._substrates.add(substrate_id) + + def add_product(self, product_id): + """Add a product, identified by its node ID, to the reaction.""" + if self._pathway is not None: + if int(product_id) not in self._pathway.entries: + raise ValueError( + "Couldn't add product, no node ID %d in Pathway" % product_id + ) + self._products.add(int(product_id)) + + # The node ID is also the node ID of the Entry that corresponds to the + # reaction; we get the corresponding Entry when there is an associated + # Pathway + def _getid(self): + return self._id + + def _setid(self, value): + self._id = int(value) + + def _delid(self): + del self._id + + id = property(_getid, _setid, _delid, "Node ID for the reaction.") + + # Names may show up as a space-separated list of several KEGG identifiers + def _getnames(self): + return " ".join(self._names) + + def _setnames(self, value): + self._names.extend(value.split()) + + def _delnames(self): + del self.names + + name = property( + _getnames, _setnames, _delnames, "List of KEGG identifiers for the reaction." 
+ ) + + # products and substrates are read-only properties, returning lists + # of Entry objects + @property + def substrates(self): + """Return list of substrate Entry elements.""" + return [self._pathway.entries[sid] for sid in self._substrates] + + @property + def products(self): + """Return list of product Entry elements.""" + return [self._pathway.entries[pid] for pid in self._products] + + @property + def entry(self): + """Return the Entry corresponding to this reaction.""" + return self._pathway.entries[self._id] + + @property + def reactant_ids(self): + """Return a list of substrate and product reactant IDs.""" + return self._products.union(self._substrates) + + @property + def element(self): + """Return KGML element describing the Reaction.""" + # The root is this Relation element + reaction = ET.Element("reaction") + reaction.attrib = {"id": str(self.id), "name": self.name, "type": self.type} + for s in self._substrates: + substrate = ET.Element("substrate") + substrate.attrib["id"] = str(s) + substrate.attrib["name"] = self._pathway.entries[s].name + reaction.append(substrate) + for p in self._products: + product = ET.Element("product") + product.attrib["id"] = str(p) + product.attrib["name"] = self._pathway.entries[p].name + reaction.append(product) + return reaction + + +# Relation +class Relation: + """A relationship between to products, KOs, or protein and compound. + + This describes a relationship between two products, KOs, or protein + and compound, as described in release KGML v0.7.2 + (http://www.kegg.jp/kegg/xml/docs/) + + Attributes: + - entry1 - The first Entry object node ID defining the + relation (int) + - entry2 - The second Entry object node ID defining the + relation (int) + - type - The relation type + - subtypes - List of subtypes for the relation, as a list of + (name, value) tuples + + """ + + def __init__(self): + """Initialize the class.""" + self._entry1 = None + self._entry2 = None + self.type = "" + self.subtypes = [] + self._pathway = None + + def __str__(self): + """Return a useful human-readable string.""" + outstr = [ + "Relation (subtypes: %d):" % len(self.subtypes), + "Entry1:", + str(self.entry1), + "Entry2:", + str(self.entry2), + ] + for s in self.subtypes: + outstr.extend(["Subtype: %s" % s[0], str(s[1])]) + return "\n".join(outstr) + + # Properties entry1 and entry2 + def _getentry1(self): + if self._pathway is not None: + return self._pathway.entries[self._entry1] + return self._entry1 + + def _setentry1(self, value): + self._entry1 = int(value) + + def _delentry1(self): + del self._entry1 + + entry1 = property(_getentry1, _setentry1, _delentry1, "Entry1 of the relation.") + + def _getentry2(self): + if self._pathway is not None: + return self._pathway.entries[self._entry2] + return self._entry2 + + def _setentry2(self, value): + self._entry2 = int(value) + + def _delentry2(self): + del self._entry2 + + entry2 = property(_getentry2, _setentry2, _delentry2, "Entry2 of the relation.") + + @property + def element(self): + """Return KGML element describing the Relation.""" + # The root is this Relation element + relation = ET.Element("relation") + relation.attrib = { + "entry1": str(self._entry1), + "entry2": str(self._entry2), + "type": self.type, + } + for (name, value) in self.subtypes: + subtype = ET.Element("subtype") + subtype.attrib = {"name": name, "value": str(value)} + relation.append(subtype) + return relation diff --git a/code/lib/Bio/KEGG/KGML/__init__.py b/code/lib/Bio/KEGG/KGML/__init__.py new file mode 100644 index 
0000000..9063911 --- /dev/null +++ b/code/lib/Bio/KEGG/KGML/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2013 by Leighton Pritchard. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Code to work with data from the KEGG database. + +References: +Kanehisa, M. and Goto, S.; KEGG: Kyoto Encyclopedia of Genes and Genomes. +Nucleic Acids Res. 28, 29-34 (2000). + +URL: http://www.genome.ad.jp/kegg/ + +""" diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc new file mode 100644 index 0000000..9ed45a0 Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc new file mode 100644 index 0000000..687a3d2 Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..236575f Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/Map/__init__.py b/code/lib/Bio/KEGG/Map/__init__.py new file mode 100644 index 0000000..e1b37f7 --- /dev/null +++ b/code/lib/Bio/KEGG/Map/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2001 by Tarjei Mikkelsen. All rights reserved. +# Copyright 2007 by Michiel de Hoon. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Load KEGG Pathway maps for use with the Biopython Pathway module. + +The pathway maps are in the format:: + + RXXXXX:[X.X.X.X:] A + 2 B <=> C + RXXXXX:[X.X.X.X:] 3C <=> 2 D + E + ... + +where RXXXXX is a five-digit reaction id, and X.X.X.X is the optional +EC number of the enzyme that catalyze the reaction. 
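+
+For example (an illustrative row, not taken from a real map file), the line::
+
+    R00001:1.2.3.4: A + 2 B <=> C
+
+parses to a Bio.Pathway.Reaction with reactants {'A': -1, 'B': -2, 'C': 1},
+catalysts [('1.2.3.4',)], and 'R00001' as its data; negative counts denote
+consumed substrates.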
+""" + +from Bio.Pathway import Reaction + + +def parse(handle): + """Parse a KEGG pathway map.""" + for line in handle: + data, catalysts, reaction = line.split(":") + catalysts = [(catalysts,)] + reactants = {} + before, after = reaction.split("<=>") + compounds = before.split(" + ") + for compound in compounds: + compound = compound.strip() + try: + number, compound = compound.split() + number = -int(number) + except ValueError: + number = -1 + reactants[compound] = number + compounds = after.split(" + ") + for compound in compounds: + compound = compound.strip() + try: + number, compound = compound.split() + number = int(number) + except ValueError: + number = +1 + reactants[compound] = number + yield Reaction(reactants, catalysts, True, data) diff --git a/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..c220247 Binary files /dev/null and b/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/KEGG/REST.py b/code/lib/Bio/KEGG/REST.py new file mode 100644 index 0000000..11f9f98 --- /dev/null +++ b/code/lib/Bio/KEGG/REST.py @@ -0,0 +1,315 @@ +# Copyright 2014 by Kevin Wu. +# Revisions copyright 2014 by Peter Cock. +# All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Provides code to access the REST-style KEGG online API. + +This module aims to make the KEGG online REST-style API easier to use. See: +http://www.kegg.jp/kegg/rest/keggapi.html + +The KEGG REST-style API provides simple access to a range of KEGG databases. +This works using simple URLs (which this module will construct for you), +with any errors indicated via HTTP error levels. + +The functionality is somewhat similar to Biopython's Bio.TogoWS and Bio.Entrez +modules. + +Currently KEGG does not provide any usage guidelines (unlike the NCBI whose +requirements are reasonably clear). To avoid risking overloading the service, +Biopython will only allow three calls per second. + +References: +Kanehisa, M. and Goto, S.; KEGG: Kyoto Encyclopedia of Genes and Genomes. +Nucleic Acids Res. 28, 29-34 (2000). + +""" + +import io +from urllib.request import urlopen + + +def _q(op, arg1, arg2=None, arg3=None): + URL = "http://rest.kegg.jp/%s" + if arg2 and arg3: + args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3) + elif arg2: + args = "%s/%s/%s" % (op, arg1, arg2) + else: + args = "%s/%s" % (op, arg1) + resp = urlopen(URL % (args)) + + if "image" == arg2: + return resp + + handle = io.TextIOWrapper(resp, encoding="UTF-8") + handle.url = resp.url + return handle + + +# http://www.kegg.jp/kegg/rest/keggapi.html +def kegg_info(database): + """KEGG info - Displays the current statistics of a given database. + + db - database or organism (string) + + The argument db can be a KEGG database name (e.g. 'pathway' or its + official abbreviation, 'path'), or a KEGG organism code or T number + (e.g. 'hsa' or 'T01001' for human). + + A valid list of organism codes and their T numbers can be obtained + via kegg_info('organism') or http://rest.kegg.jp/list/organism + + """ + # TODO - return a string (rather than the handle?) + # TODO - chache and validate the organism code / T numbers? + # TODO - can we parse the somewhat formatted output? 
+ # + # http://rest.kegg.jp/info/ + # + # = pathway | brite | module | disease | drug | environ | + # ko | genome | | compound | glycan | reaction | + # rpair | rclass | enzyme | genomes | genes | ligand | kegg + # = KEGG organism code or T number + return _q("info", database) + + +def kegg_list(database, org=None): + """KEGG list - Entry list for database, or specified database entries. + + db - database or organism (string) + org - optional organism (string), see below. + + For the pathway and module databases the optional organism can be + used to restrict the results. + + """ + # TODO - split into two functions (dbentries seems separate)? + # + # http://rest.kegg.jp/list// + # + # = pathway | module + # = KEGG organism code + if database in ("pathway", "module") and org: + resp = _q("list", database, org) + elif isinstance(database, str) and database and org: + raise ValueError("Invalid database arg for kegg list request.") + + # http://rest.kegg.jp/list/ + # + # = pathway | brite | module | disease | drug | environ | + # ko | genome | | compound | glycan | reaction | + # rpair | rclass | enzyme | organism + # = KEGG organism code or T number + # + # + # http://rest.kegg.jp/list/ + # + # = KEGG database entries involving the following + # = pathway | brite | module | disease | drug | environ | + # ko | genome | | compound | glycan | reaction | + # rpair | rclass | enzyme + # = KEGG organism code or T number + else: + if isinstance(database, list): + if len(database) > 100: + raise ValueError( + "Maximum number of databases is 100 for kegg list query" + ) + database = ("+").join(database) + resp = _q("list", database) + + return resp + + +def kegg_find(database, query, option=None): + """KEGG find - Data search. + + Finds entries with matching query keywords or other query data in + a given database. + + db - database or organism (string) + query - search terms (string) + option - search option (string), see below. + + For the compound and drug database, set option to the string 'formula', + 'exact_mass' or 'mol_weight' to search on that field only. The + chemical formula search is a partial match irrespective of the order + of atoms given. The exact mass (or molecular weight) is checked by + rounding off to the same decimal place as the query data. A range of + values may also be specified with the minus(-) sign. + + """ + # TODO - return list of tuples? + # + # http://rest.kegg.jp/find///") + counter = 0 + + while True: + start_offset = handle.tell() + line = handle.readline() + if not line: + break + if qstart_mark not in line: + continue + # The following requirements are to make supporting BGZF compressed + # BLAST XML files simpler (avoids complex offset manipulations): + assert line.count(qstart_mark) == 1, "XML without line breaks?" + assert line.lstrip().startswith(qstart_mark), line + if qend_mark in line: + # Should cope with ... on one long line + block = line + else: + # Load the rest of this block up to and including + block = [line] + while line and qend_mark not in line: + line = handle.readline() + assert qstart_mark not in line, line + block.append(line) + assert line.rstrip().endswith(qend_mark), line + block = b"".join(block) + assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block + assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block + # Now we have a full ... 
block, find the ID
+            regx = re.search(re_desc, block)
+            try:
+                qstart_desc = regx.group(2)
+                qstart_id = regx.group(1)
+            except AttributeError:
+                # use the fallback values
+                assert re.search(re_desc_end, block)
+                qstart_desc = self._fallback["description"].encode()
+                qstart_id = self._fallback["id"].encode()
+            if qstart_id.startswith(blast_id_mark):
+                qstart_id = qstart_desc.split(b" ", 1)[0]
+            yield qstart_id.decode(), start_offset, len(block)
+            counter += 1
+
+    def _parse(self, handle):
+        """Overwrite SearchIndexer parse (PRIVATE).
+
+        As we need to set the meta and fallback dictionaries to the parser.
+        """
+        generator = self._parser(handle, **self._kwargs)
+        generator._meta = self._meta
+        generator._fallback = self._fallback
+        return next(iter(generator))
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        qend_mark = self.qend_mark
+        handle = self._handle
+        handle.seek(offset)
+
+        qresult_raw = handle.readline()
+        assert qresult_raw.lstrip().startswith(self.qstart_mark)
+        while qend_mark not in qresult_raw:
+            qresult_raw += handle.readline()
+        assert qresult_raw.rstrip().endswith(qend_mark)
+        assert qresult_raw.count(qend_mark) == 1
+        # Note this will include any leading and trailing whitespace, in
+        # general expecting "<Iteration>\n...\n</Iteration>\n"
+        return qresult_raw
+
+
+class _BlastXmlGenerator(XMLGenerator):
+    """Event-based XML Generator."""
+
+    def __init__(self, out, encoding="utf-8", indent=" ", increment=2):
+        """Initialize the class."""
+        XMLGenerator.__init__(self, out, encoding)
+        # the indentation character
+        self._indent = indent
+        # nest level
+        self._level = 0
+        # how many indentation character should we increment per level
+        self._increment = increment
+        # container for names of tags with children
+        self._parent_stack = []
+        # determine writer method
+
+    def startDocument(self):
+        """Start the XML document."""
+        self._write(
+            '<?xml version="1.0"?>\n'
+            '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
+            '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n'
+        )
+
+    def startElement(self, name, attrs=None, children=False):
+        """Start an XML element.
+
+        :param name: element name
+        :type name: string
+        :param attrs: element attributes
+        :type attrs: dictionary {string: object}
+        :param children: whether the element has children or not
+        :type children: bool
+
+        """
+        if attrs is None:
+            attrs = {}
+        self.ignorableWhitespace(self._indent * self._level)
+        XMLGenerator.startElement(self, name, attrs)
+
+    def endElement(self, name):
+        """End an XML element of the given name."""
+        XMLGenerator.endElement(self, name)
+        self._write("\n")
+
+    def startParent(self, name, attrs=None):
+        """Start an XML element which has children.
+
+        :param name: element name
+        :type name: string
+        :param attrs: element attributes
+        :type attrs: dictionary {string: object}
+
+        """
+        if attrs is None:
+            attrs = {}
+        self.startElement(name, attrs, children=True)
+        self._level += self._increment
+        self._write("\n")
+        # append the element name, so we can end it later
+        self._parent_stack.append(name)
+
+    def endParent(self):
+        """End an XML element with children."""
+        # the element to end is the one on top of the stack
+        name = self._parent_stack.pop()
+        self._level -= self._increment
+        self.ignorableWhitespace(self._indent * self._level)
+        self.endElement(name)
+
+    def startParents(self, *names):
+        """Start XML elements without children."""
+        for name in names:
+            self.startParent(name)
+
+    def endParents(self, num):
+        """End XML elements, according to the given number."""
+        for i in range(num):
+            self.endParent()
+
+    def simpleElement(self, name, content=None):
+        """Create an XML element without children with the given content."""
+        self.startElement(name, attrs={})
+        if content:
+            self.characters(content)
+        self.endElement(name)
+
+    def characters(self, content):
+        """Replace quotes and apostrophe."""
+        content = escape(str(content))
+        for a, b in (('"', "&quot;"), ("'", "&apos;")):
+            content = content.replace(a, b)
+        self._write(content)
+
+
+class BlastXmlWriter:
+    """Stream-based BLAST+ XML Writer."""
+
+    def __init__(self, handle, use_raw_query_ids=True, use_raw_hit_ids=True):
+        """Initialize the class."""
+        self.xml = _BlastXmlGenerator(handle, "utf-8")
+        self._use_raw_query_ids = use_raw_query_ids
+        self._use_raw_hit_ids = use_raw_hit_ids
+
+    def write_file(self, qresults):
+        """Write the XML contents to the output handle."""
+        xml = self.xml
+        self.qresult_counter, self.hit_counter, self.hsp_counter, self.frag_counter = (
+            0,
+            0,
+            0,
+            0,
+        )
+
+        # get the first qresult, since the preamble requires its attr values
+        first_qresult = next(qresults)
+        # start the XML document, set the root element, and create the preamble
+        xml.startDocument()
+        xml.startParent("BlastOutput")
+        self._write_preamble(first_qresult)
+        # and write the qresults
+        xml.startParent("BlastOutput_iterations")
+        self._write_qresults(chain([first_qresult], qresults))
+        xml.endParents(2)
+        xml.endDocument()
+
+        return (
+            self.qresult_counter,
+            self.hit_counter,
+            self.hsp_counter,
+            self.frag_counter,
+        )
+
+    def _write_elem_block(self, block_name, map_name, obj, opt_dict=None):
+        """Write sibling XML elements (PRIVATE).
+ + :param block_name: common element name prefix + :type block_name: string + :param map_name: name of mapping between element and attribute names + :type map_name: string + :param obj: object whose attribute value will be used + :type obj: object + :param opt_dict: custom element-attribute mapping + :type opt_dict: dictionary {string: string} + + """ + if opt_dict is None: + opt_dict = {} + for elem, attr in _WRITE_MAPS[map_name]: + elem = block_name + elem + try: + content = str(getattr(obj, attr)) + except AttributeError: + # ensure attrs that is not present is optional + if elem not in _DTD_OPT: + raise ValueError( + "Element %r (attribute %r) not found" % (elem, attr) + ) + else: + # custom element-attribute mapping, for fallback values + if elem in opt_dict: + content = opt_dict[elem] + self.xml.simpleElement(elem, content) + + def _write_preamble(self, qresult): + """Write the XML file preamble (PRIVATE).""" + xml = self.xml + + for elem, attr in _WRITE_MAPS["preamble"]: + elem = "BlastOutput_" + elem + if elem == "BlastOutput_param": + xml.startParent(elem) + self._write_param(qresult) + xml.endParent() + continue + try: + content = str(getattr(qresult, attr)) + except AttributeError: + if elem not in _DTD_OPT: + raise ValueError( + "Element %s (attribute %s) not found" % (elem, attr) + ) + else: + if elem == "BlastOutput_version": + content = "%s %s" % (qresult.program.upper(), qresult.version) + elif qresult.blast_id: + if elem == "BlastOutput_query-ID": + content = qresult.blast_id + elif elem == "BlastOutput_query-def": + content = " ".join([qresult.id, qresult.description]).strip() + xml.simpleElement(elem, content) + + def _write_param(self, qresult): + """Write the parameter block of the preamble (PRIVATE).""" + xml = self.xml + xml.startParent("Parameters") + self._write_elem_block("Parameters_", "param", qresult) + xml.endParent() + + def _write_qresults(self, qresults): + """Write QueryResult objects into iteration elements (PRIVATE).""" + xml = self.xml + + for num, qresult in enumerate(qresults): + xml.startParent("Iteration") + xml.simpleElement("Iteration_iter-num", str(num + 1)) + opt_dict = {} + if self._use_raw_query_ids: + query_id = qresult.blast_id + query_desc = qresult.id + " " + qresult.description + else: + query_id = qresult.id + query_desc = qresult.description + + opt_dict = { + "Iteration_query-ID": query_id, + "Iteration_query-def": query_desc, + } + self._write_elem_block("Iteration_", "qresult", qresult, opt_dict) + # the Iteration_hits tag only has children if there are hits + if qresult: + xml.startParent("Iteration_hits") + self._write_hits(qresult.hits) + xml.endParent() + # otherwise it's a simple element without any contents + else: + xml.simpleElement("Iteration_hits", "") + + xml.startParents("Iteration_stat", "Statistics") + self._write_elem_block("Statistics_", "stat", qresult) + xml.endParents(2) + # there's a message if no hits is present + if not qresult: + xml.simpleElement("Iteration_message", "No hits found") + self.qresult_counter += 1 + xml.endParent() + + def _write_hits(self, hits): + """Write Hit objects (PRIVATE).""" + xml = self.xml + + for num, hit in enumerate(hits): + xml.startParent("Hit") + xml.simpleElement("Hit_num", str(num + 1)) + # use custom hit_id and hit_def mapping if the hit has a + # BLAST-generated ID + opt_dict = {} + + if self._use_raw_hit_ids: + hit_id = hit.blast_id + hit_desc = " >".join( + [f"{x} {y}" for x, y in zip(hit.id_all, hit.description_all)] + ) + else: + hit_id = hit.id + hit_desc = 
hit.description + " >".join( + [ + f"{x} {y}" + for x, y in zip(hit.id_all[1:], hit.description_all[1:]) + ] + ) + + opt_dict = {"Hit_id": hit_id, "Hit_def": hit_desc} + self._write_elem_block("Hit_", "hit", hit, opt_dict) + xml.startParent("Hit_hsps") + self._write_hsps(hit.hsps) + self.hit_counter += 1 + xml.endParents(2) + + def _write_hsps(self, hsps): + """Write HSP objects (PRIVATE).""" + xml = self.xml + for num, hsp in enumerate(hsps): + xml.startParent("Hsp") + xml.simpleElement("Hsp_num", str(num + 1)) + for elem, attr in _WRITE_MAPS["hsp"]: + elem = "Hsp_" + elem + try: + content = self._adjust_output(hsp, elem, attr) + # make sure any elements that is not present is optional + # in the DTD + except AttributeError: + if elem not in _DTD_OPT: + raise ValueError( + "Element %s (attribute %s) not found" % (elem, attr) + ) + else: + xml.simpleElement(elem, str(content)) + self.hsp_counter += 1 + self.frag_counter += len(hsp.fragments) + xml.endParent() + + def _adjust_output(self, hsp, elem, attr): + """Adjust output to mimic native BLAST+ XML as much as possible (PRIVATE).""" + # adjust coordinates + if attr in ( + "query_start", + "query_end", + "hit_start", + "hit_end", + "pattern_start", + "pattern_end", + ): + content = getattr(hsp, attr) + 1 + if "_start" in attr: + content = getattr(hsp, attr) + 1 + else: + content = getattr(hsp, attr) + + # adjust for 'from' <--> 'to' flip if it's not a translated search + # and frames are different + # adapted from /src/algo/blast/format/blastxml_format.cpp#L216 + if hsp.query_frame != 0 and hsp.hit_frame < 0: + if attr == "hit_start": + content = getattr(hsp, "hit_end") + elif attr == "hit_end": + content = getattr(hsp, "hit_start") + 1 + + # for seqrecord objects, we only need the sequence string + elif elem in ("Hsp_hseq", "Hsp_qseq"): + content = str(getattr(hsp, attr).seq) + elif elem == "Hsp_midline": + content = hsp.aln_annotation["similarity"] + elif elem in ("Hsp_evalue", "Hsp_bit-score"): + # adapted from src/algo/blast/format/blastxml_format.cpp#L138-140 + content = "%.*g" % (6, getattr(hsp, attr)) + else: + content = getattr(hsp, attr) + + return content + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/BlatIO.py b/code/lib/Bio/SearchIO/BlatIO.py new file mode 100644 index 0000000..fde64dd --- /dev/null +++ b/code/lib/Bio/SearchIO/BlatIO.py @@ -0,0 +1,751 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for BLAT output formats. + +This module adds support for parsing BLAT outputs. BLAT (BLAST-Like Alignment +Tool) is a sequence similarity search program initially built for annotating +the human genome. + +Bio.SearchIO.BlastIO was tested using standalone BLAT version 34, psLayout +version 3. It should be able to parse psLayout version 4 without problems. 
+ +More information on BLAT is available from these sites: + + - Publication: http://genome.cshlp.org/content/12/4/656 + - User guide: http://genome.ucsc.edu/goldenPath/help/blatSpec.html + - Source download: http://www.soe.ucsc.edu/~kent/src + - Executable download: http://hgdownload.cse.ucsc.edu/admin/exe/ + - Blat score calculation: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 + + +Supported Formats +================= + +BlatIO supports parsing, indexing, and writing for both PSL and PSLX output +formats, with or without header. To parse, index, or write PSLX files, use the +'pslx' keyword argument and set it to True. + + # blat-psl defaults to PSL files + >>> from Bio import SearchIO + >>> psl = 'Blat/psl_34_004.psl' + >>> qresult = SearchIO.read(psl, 'blat-psl') + >>> qresult + QueryResult(id='hg19_dna', 10 hits) + + # set the pslx flag to parse PSLX files + >>> pslx = 'Blat/pslx_34_004.pslx' + >>> qresult = SearchIO.read(pslx, 'blat-psl', pslx=True) + >>> qresult + QueryResult(id='hg19_dna', 10 hits) + +For parsing and indexing, you do not need to specify whether the file has a +header or not. For writing, if you want to write a header, you can set the +'header' keyword argument to True. This will write a 'psLayout version 3' header +to your output file. + + from Bio import SearchIO + qresult = SearchIO.read(psl, 'blat-psl') + SearchIO.write(qresult, 'header.psl', header=True) + (1, 10, 19, 23) + +Note that the number of HSPFragments written may exceed the number of HSP +objects. This is because in PSL files, it is possible to have single matches +consisting of noncontiguous sequence fragments. This is where the HSPFragment +object comes into play. These fragments are grouped into a single HSP because +they share the same statistics (e.g. match numbers, BLAT score, etc.). However, +they do not share the same sequence attributes, such as the start and end +coordinates, making them distinct objects. + +In addition to parsing PSL(X) files, BlatIO also computes the percent identities +and scores of your search results. This is done using the calculation formula +posted here: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4. It mimics the score +and percent identity calculation done by UCSC's web BLAT service. + +Since BlatIO parses the file in a single pass, it expects all results from +the same query to be in consecutive rows. If the results from one query are +spread in nonconsecutive rows, BlatIO will consider them to be separate +QueryResult objects. + +In most cases, the PSL(X) format uses the same coordinate system as Python +(zero-based, half open). These coordinates are anchored on the plus strand. +However, if the query aligns on the minus strand, BLAT will anchor the qStarts +coordinates on the minus strand instead. BlatIO is aware of this, and will +re-anchor the qStarts coordinates to the plus strand whenever it sees a minus +strand query match. Conversely, when you write out to a PSL(X) file, BlatIO will +reanchor qStarts to the minus strand again. 
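+
+As a sketch of the arithmetic involved (implemented by the private helper
+``_reorient_starts`` further below), a minus-strand block start is re-anchored
+to the plus strand with:
+
+    plus_start = seqlen - minus_start - blksize   # e.g. 100 - 30 - 10 = 60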
+
+BlatIO provides the following attribute-column mapping:
+
++----------------+-------------------------+-----------------------------------+
+| Object         | Attribute               | Column Name, Value                |
++================+=========================+===================================+
+| QueryResult    | id                      | Q name, query sequence ID         |
+|                +-------------------------+-----------------------------------+
+|                | seq_len                 | Q size, query sequence full       |
+|                |                         | length                            |
++----------------+-------------------------+-----------------------------------+
+| Hit            | id                      | T name, hit sequence ID           |
+|                +-------------------------+-----------------------------------+
+|                | seq_len                 | T size, hit sequence full length  |
++----------------+-------------------------+-----------------------------------+
+| HSP            | hit_end                 | T end, end coordinate of the last |
+|                |                         | hit fragment                      |
+|                +-------------------------+-----------------------------------+
+|                | hit_gap_num             | T gap bases, number of bases      |
+|                |                         | inserted in hit                   |
+|                +-------------------------+-----------------------------------+
+|                | hit_gapopen_num         | T gap count, number of hit gap    |
+|                |                         | inserts                           |
+|                +-------------------------+-----------------------------------+
+|                | hit_span_all            | blockSizes, sizes of each         |
+|                |                         | fragment                          |
+|                +-------------------------+-----------------------------------+
+|                | hit_start               | T start, start coordinate of the  |
+|                |                         | first hit fragment                |
+|                +-------------------------+-----------------------------------+
+|                | hit_start_all           | tStarts, start coordinate of each |
+|                |                         | hit fragment                      |
+|                +-------------------------+-----------------------------------+
+|                | match_num               | match, number of non-repeat       |
+|                |                         | matches                           |
+|                +-------------------------+-----------------------------------+
+|                | mismatch_num            | mismatch, number of mismatches    |
+|                +-------------------------+-----------------------------------+
+|                | match_rep_num           | rep. match, number of matches     |
+|                |                         | that are part of repeats          |
+|                +-------------------------+-----------------------------------+
+|                | n_num                   | N's, number of N bases            |
+|                +-------------------------+-----------------------------------+
+|                | query_end               | Q end, end coordinate of the last |
+|                |                         | query fragment                    |
+|                +-------------------------+-----------------------------------+
+|                | query_gap_num           | Q gap bases, number of bases      |
+|                |                         | inserted in query                 |
+|                +-------------------------+-----------------------------------+
+|                | query_gapopen_num       | Q gap count, number of query gap  |
+|                |                         | inserts                           |
+|                +-------------------------+-----------------------------------+
+|                | query_span_all          | blockSizes, sizes of each         |
+|                |                         | fragment                          |
+|                +-------------------------+-----------------------------------+
+|                | query_start             | Q start, start coordinate of the  |
+|                |                         | first query block                 |
+|                +-------------------------+-----------------------------------+
+|                | query_start_all         | qStarts, start coordinate of each |
+|                |                         | query fragment                    |
+|                +-------------------------+-----------------------------------+
+|                | len [*]_                | block count, the number of blocks |
+|                |                         | in the alignment                  |
++----------------+-------------------------+-----------------------------------+
+| HSPFragment    | hit                     | hit sequence, if present          |
+|                +-------------------------+-----------------------------------+
+|                | hit_strand              | strand, hit sequence strand       |
+|                +-------------------------+-----------------------------------+
+|                | query                   | query sequence, if present        |
+|                +-------------------------+-----------------------------------+
+|                | query_strand            | strand, query sequence strand     |
++----------------+-------------------------+-----------------------------------+
+
+In addition to the column mappings above, BlatIO also provides the following
+object attributes:
+
++----------------+-------------------------+-----------------------------------+
+| Object         | Attribute               | Value                             |
++================+=========================+===================================+
+| HSP            | gapopen_num             | Q gap count + T gap count, total  |
+|                |                         | number of gap openings            |
+|                +-------------------------+-----------------------------------+
+|                | ident_num               | matches + repmatches, total       |
+|                |                         | number of identical residues      |
+|                +-------------------------+-----------------------------------+
+|                | ident_pct               | percent identity, calculated      |
+|                |                         | using UCSC's formula              |
+|                +-------------------------+-----------------------------------+
+|                | query_is_protein        | boolean, whether the query        |
+|                |                         | sequence is a protein             |
+|                +-------------------------+-----------------------------------+
+|                | score                   | HSP score, calculated using       |
+|                |                         | UCSC's formula                    |
++----------------+-------------------------+-----------------------------------+
+
+Finally, the default HSP and HSPFragment properties are also provided. See the
+HSP and HSPFragment documentation for more details on these properties.
+
+
+..
[*] You can obtain the number of blocks / fragments in the HSP by invoking + ``len`` on the HSP + +""" +import re +from math import log + +from Bio.SearchIO._index import SearchIndexer +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + + +__all__ = ("BlatPslParser", "BlatPslIndexer", "BlatPslWriter") + + +# precompile regex patterns +_PTR_ROW_CHECK = r"^\d+\s+\d+\s+\d+\s+\d+" +_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK) +_RE_ROW_CHECK_IDX = re.compile(_PTR_ROW_CHECK.encode()) + + +def _list_from_csv(csv_string, caster=None): + """Transform the given comma-separated string into a list (PRIVATE). + + :param csv_string: comma-separated input string + :type csv_string: string + :param caster: function used to cast each item in the input string + to its intended type + :type caster: callable, accepts string, returns object + + """ + if caster is None: + return [x for x in csv_string.split(",") if x] + else: + return [caster(x) for x in csv_string.split(",") if x] + + +def _reorient_starts(starts, blksizes, seqlen, strand): + """Reorients block starts into the opposite strand's coordinates (PRIVATE). + + :param starts: start coordinates + :type starts: list [int] + :param blksizes: block sizes + :type blksizes: list [int] + :param seqlen: sequence length + :type seqlen: int + :param strand: sequence strand + :type strand: int, choice of -1, 0, or 1 + + """ + if len(starts) != len(blksizes): + raise RuntimeError( + "Unequal start coordinates and block sizes list (%r vs %r)" + % (len(starts), len(blksizes)) + ) + # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html + # no need to reorient if it's already the positive strand + if strand >= 0: + return starts + else: + # the plus-oriented coordinate is calculated by this: + # plus_coord = length - minus_coord - block_size + return [seqlen - start - blksize for start, blksize in zip(starts, blksizes)] + + +def _is_protein(psl): + """Validate if psl is protein (PRIVATE).""" + # check if query is protein or not + # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 + if len(psl["strand"]) == 2: + if psl["strand"][1] == "+": + return psl["tend"] == psl["tstarts"][-1] + 3 * psl["blocksizes"][-1] + elif psl["strand"][1] == "-": + return psl["tstart"] == psl["tsize"] - ( + psl["tstarts"][-1] + 3 * psl["blocksizes"][-1] + ) + + return False + + +def _calc_millibad(psl, is_protein): + """Calculate millibad (PRIVATE).""" + # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 + size_mul = 3 if is_protein else 1 + millibad = 0 + + qali_size = size_mul * (psl["qend"] - psl["qstart"]) + tali_size = psl["tend"] - psl["tstart"] + ali_size = min(qali_size, tali_size) + if ali_size <= 0: + return 0 + + size_dif = qali_size - tali_size + size_dif = 0 if size_dif < 0 else size_dif + + total = size_mul * (psl["matches"] + psl["repmatches"] + psl["mismatches"]) + if total != 0: + millibad = ( + 1000 + * ( + psl["mismatches"] * size_mul + + psl["qnuminsert"] + + round(3 * log(1 + size_dif)) + ) + ) / total + + return millibad + + +def _calc_score(psl, is_protein): + """Calculate score (PRIVATE).""" + # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 + size_mul = 3 if is_protein else 1 + return ( + size_mul * (psl["matches"] + (psl["repmatches"] >> 1)) + - size_mul * psl["mismatches"] + - psl["qnuminsert"] + - psl["tnuminsert"] + ) + + +def _create_hsp(hid, qid, psl): + """Create high scoring pair object (PRIVATE).""" + # protein flag + is_protein = _is_protein(psl) + # strand + # if query is protein, strand is 0 + if 
is_protein: + qstrand = 0 + else: + qstrand = 1 if psl["strand"][0] == "+" else -1 + # try to get hit strand, if it exists + try: + hstrand = 1 if psl["strand"][1] == "+" else -1 + except IndexError: + hstrand = 1 # hit strand defaults to plus + + blocksize_multiplier = 3 if is_protein else 1 + # query block starts + qstarts = _reorient_starts(psl["qstarts"], psl["blocksizes"], psl["qsize"], qstrand) + # hit block starts + if len(psl["strand"]) == 2: + hstarts = _reorient_starts( + psl["tstarts"], + [blocksize_multiplier * i for i in psl["blocksizes"]], + psl["tsize"], + hstrand, + ) + else: + hstarts = psl["tstarts"] + # set query and hit coords + # this assumes each block has no gaps (which seems to be the case) + assert len(qstarts) == len(hstarts) == len(psl["blocksizes"]) + query_range_all = list( + zip(qstarts, [x + y for x, y in zip(qstarts, psl["blocksizes"])]) + ) + hit_range_all = list( + zip( + hstarts, + [x + y * blocksize_multiplier for x, y in zip(hstarts, psl["blocksizes"])], + ) + ) + # check length of sequences and coordinates, all must match + if "tseqs" in psl and "qseqs" in psl: + assert ( + len(psl["tseqs"]) + == len(psl["qseqs"]) + == len(query_range_all) + == len(hit_range_all) + ) + else: + assert len(query_range_all) == len(hit_range_all) + + frags = [] + # iterating over query_range_all, but hit_range_all works just as well + for idx, qcoords in enumerate(query_range_all): + hseqlist = psl.get("tseqs") + hseq = "" if not hseqlist else hseqlist[idx] + qseqlist = psl.get("qseqs") + qseq = "" if not qseqlist else qseqlist[idx] + frag = HSPFragment(hid, qid, hit=hseq, query=qseq) + # set molecule type + frag.molecule_type = "DNA" + # set coordinates + frag.query_start = qcoords[0] + frag.query_end = qcoords[1] + frag.hit_start = hit_range_all[idx][0] + frag.hit_end = hit_range_all[idx][1] + # and strands + frag.query_strand = qstrand + frag.hit_strand = hstrand + frags.append(frag) + + # create hsp object + hsp = HSP(frags) + # check if start and end are set correctly + assert hsp.query_start == psl["qstart"] + assert hsp.query_end == psl["qend"] + assert hsp.hit_start == psl["tstart"] + assert hsp.hit_end == psl["tend"] + # and check block spans as well + hit_spans = [span / blocksize_multiplier for span in hsp.hit_span_all] + assert hit_spans == hsp.query_span_all == psl["blocksizes"] + # set its attributes + hsp.match_num = psl["matches"] + hsp.mismatch_num = psl["mismatches"] + hsp.match_rep_num = psl["repmatches"] + hsp.n_num = psl["ncount"] + hsp.query_gapopen_num = psl["qnuminsert"] + hsp.query_gap_num = psl["qbaseinsert"] + hsp.hit_gapopen_num = psl["tnuminsert"] + hsp.hit_gap_num = psl["tbaseinsert"] + + hsp.ident_num = psl["matches"] + psl["repmatches"] + hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"] + hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"] + hsp.query_is_protein = is_protein + hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1 + hsp.score = _calc_score(psl, is_protein) + # helper flag, for writing + hsp._has_hit_strand = len(psl["strand"]) == 2 + + return hsp + + +class BlatPslParser: + """Parser for the BLAT PSL format.""" + + def __init__(self, handle, pslx=False): + """Initialize the class.""" + self.handle = handle + self.line = self.handle.readline() + self.pslx = pslx + + def __iter__(self): + """Iterate over BlatPslParser, yields query results.""" + # break out if it's an empty file + if not self.line: + return + + # read through header + # this assumes that the result row match the regex + while not 
re.search(_RE_ROW_CHECK, self.line.strip()): + self.line = self.handle.readline() + if not self.line: + return + + # parse into query results + for qresult in self._parse_qresult(): + qresult.program = "blat" + yield qresult + + def _parse_row(self): + """Return a dictionary of parsed column values (PRIVATE).""" + assert self.line + cols = [x for x in self.line.strip().split("\t") if x] + self._validate_cols(cols) + + psl = {} + psl["qname"] = cols[9] # qName + psl["qsize"] = int(cols[10]) # qSize + psl["tname"] = cols[13] # tName + psl["tsize"] = int(cols[14]) # tSize + psl["matches"] = int(cols[0]) # matches + psl["mismatches"] = int(cols[1]) # misMatches + psl["repmatches"] = int(cols[2]) # repMatches + psl["ncount"] = int(cols[3]) # nCount + psl["qnuminsert"] = int(cols[4]) # qNumInsert + psl["qbaseinsert"] = int(cols[5]) # qBaseInsert + psl["tnuminsert"] = int(cols[6]) # tNumInsert + psl["tbaseinsert"] = int(cols[7]) # tBaseInsert + psl["strand"] = cols[8] # strand + psl["qstart"] = int(cols[11]) # qStart + psl["qend"] = int(cols[12]) # qEnd + psl["tstart"] = int(cols[15]) # tStart + psl["tend"] = int(cols[16]) # tEnd + psl["blockcount"] = int(cols[17]) # blockCount + psl["blocksizes"] = _list_from_csv(cols[18], int) # blockSizes + psl["qstarts"] = _list_from_csv(cols[19], int) # qStarts + psl["tstarts"] = _list_from_csv(cols[20], int) # tStarts + if self.pslx: + psl["qseqs"] = _list_from_csv(cols[21]) # query sequence + psl["tseqs"] = _list_from_csv(cols[22]) # hit sequence + + return psl + + def _validate_cols(self, cols): + """Validate column's length of PSL or PSLX (PRIVATE).""" + if not self.pslx: + if len(cols) != 21: + raise ValueError( + "Invalid PSL line: %r. Expected 21 tab-separated columns, found %i" + % (self.line, len(cols)) + ) + else: + if len(cols) != 23: + raise ValueError( + "Invalid PSLX line: %r. 
Expected 23 tab-separated columns, found %i" + % (self.line, len(cols)) + ) + + def _parse_qresult(self): + """Yield QueryResult objects (PRIVATE).""" + # state values, determines what to do for each line + state_EOF = 0 + state_QRES_NEW = 1 + state_QRES_SAME = 3 + state_HIT_NEW = 2 + state_HIT_SAME = 4 + # initial dummy values + qres_state = None + file_state = None + cur_qid, cur_hid = None, None + prev_qid, prev_hid = None, None + cur, prev = None, None + hit_list, hsp_list = [], [] + + while True: + # store previous line's parsed values for all lines after the first + if cur is not None: + prev = cur + prev_qid = cur_qid + prev_hid = cur_hid + # only parse the result row if it's not EOF + if self.line: + cur = self._parse_row() + cur_qid = cur["qname"] + cur_hid = cur["tname"] + else: + file_state = state_EOF + # mock values, since we have nothing to parse + cur_qid, cur_hid = None, None + + # get the state of hit and qresult + if prev_qid != cur_qid: + qres_state = state_QRES_NEW + else: + qres_state = state_QRES_SAME + # new hits are hits with different ids or hits in a new qresult + if prev_hid != cur_hid or qres_state == state_QRES_NEW: + hit_state = state_HIT_NEW + else: + hit_state = state_HIT_SAME + + if prev is not None: + # create fragment and HSP and set their attributes + hsp = _create_hsp(prev_hid, prev_qid, prev) + hsp_list.append(hsp) + + if hit_state == state_HIT_NEW: + # create Hit and set its attributes + hit = Hit(hsp_list) + hit.seq_len = prev["tsize"] + hit_list.append(hit) + hsp_list = [] + + # create qresult and yield if we're at a new qresult or at EOF + if qres_state == state_QRES_NEW or file_state == state_EOF: + qresult = QueryResult(id=prev_qid) + for hit in hit_list: + qresult.absorb(hit) + qresult.seq_len = prev["qsize"] + yield qresult + # if we're at EOF, break + if file_state == state_EOF: + break + hit_list = [] + + self.line = self.handle.readline() + + +class BlatPslIndexer(SearchIndexer): + """Indexer class for BLAT PSL output.""" + + _parser = BlatPslParser + + def __init__(self, filename, pslx=False): + """Initialize the class.""" + SearchIndexer.__init__(self, filename, pslx=pslx) + + def __iter__(self): + """Iterate over the file handle; yields key, start offset, and length.""" + handle = self._handle + handle.seek(0) + # denotes column location for query identifier + query_id_idx = 9 + qresult_key = None + tab_char = b"\t" + + start_offset = handle.tell() + line = handle.readline() + # read through header + # this assumes that the result row match the regex + while not re.search(_RE_ROW_CHECK_IDX, line.strip()): + start_offset = handle.tell() + line = handle.readline() + if not line: + return + + # and index the qresults + while True: + end_offset = handle.tell() + + cols = [x for x in line.strip().split(tab_char) if x] + if qresult_key is None: + qresult_key = cols[query_id_idx] + else: + curr_key = cols[query_id_idx] + + if curr_key != qresult_key: + yield qresult_key.decode(), start_offset, end_offset - start_offset + qresult_key = curr_key + start_offset = end_offset - len(line) + + line = handle.readline() + if not line: + yield qresult_key.decode(), start_offset, end_offset - start_offset + break + + def get_raw(self, offset): + """Return raw bytes string of a QueryResult object from the given offset.""" + handle = self._handle + handle.seek(offset) + query_id_idx = 9 + qresult_key = None + qresult_raw = b"" + tab_char = b"\t" + + while True: + line = handle.readline() + if not line: + break + cols = [x for x in 
line.strip().split(tab_char) if x] + if qresult_key is None: + qresult_key = cols[query_id_idx] + else: + curr_key = cols[query_id_idx] + if curr_key != qresult_key: + break + qresult_raw += line + + return qresult_raw + + +class BlatPslWriter: + """Writer for the blat-psl format.""" + + def __init__(self, handle, header=False, pslx=False): + """Initialize the class.""" + self.handle = handle + # flag for writing header or not + self.header = header + self.pslx = pslx + + def write_file(self, qresults): + """Write query results to file.""" + handle = self.handle + qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 + + if self.header: + handle.write(self._build_header()) + + for qresult in qresults: + if qresult: + handle.write(self._build_row(qresult)) + qresult_counter += 1 + hit_counter += len(qresult) + hsp_counter += sum(len(hit) for hit in qresult) + frag_counter += sum(len(hit.fragments) for hit in qresult) + + return qresult_counter, hit_counter, hsp_counter, frag_counter + + def _build_header(self): + """Build header, tab-separated string (PRIVATE).""" + # for now, always use the psLayout version 3 + header = "psLayout version 3\n" + + # adapted from BLAT's source: lib/psl.c#L496 + header += ( + "\nmatch\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT " + "gap\tstrand\tQ \tQ \tQ \tQ \tT \tT " + "\tT \tT \tblock\tblockSizes \tqStarts\t tStarts" + "\n \tmatch\tmatch\t \tcount\tbases\tcount\tbases" + "\t \tname \tsize\tstart\tend\tname \tsize" + "\tstart\tend\tcount\n%s\n" % ("-" * 159) + ) + + return header + + def _build_row(self, qresult): + """Return a string or one row or more of the QueryResult object (PRIVATE).""" + # For now, our writer writes the row according to the order in + # the QueryResult and Hit objects. + # This is different from BLAT's native output, where the rows are + # grouped by strand. + # Should we tweak the behavior to better mimic the native output? 
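+        # The columns below follow the 21-column PSL order that _parse_row
+        # reads back in: matches, misMatches, repMatches, nCount, qNumInsert,
+        # qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart,
+        # qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts,
+        # tStarts (plus the two sequence columns when writing PSLX).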
+        qresult_lines = []
+
+        for hit in qresult:
+            for hsp in hit.hsps:
+
+                query_is_protein = getattr(hsp, "query_is_protein", False)
+                blocksize_multiplier = 3 if query_is_protein else 1
+
+                line = []
+                line.append(hsp.match_num)
+                line.append(hsp.mismatch_num)
+                line.append(hsp.match_rep_num)
+                line.append(hsp.n_num)
+                line.append(hsp.query_gapopen_num)
+                line.append(hsp.query_gap_num)
+                line.append(hsp.hit_gapopen_num)
+                line.append(hsp.hit_gap_num)
+
+                # check spans
+                eff_query_spans = [blocksize_multiplier * s for s in hsp.query_span_all]
+                if hsp.hit_span_all != eff_query_spans:
+                    raise ValueError("HSP hit span and query span values do not match.")
+                block_sizes = hsp.query_span_all
+
+                # set strand and starts
+                if hsp[0].query_strand >= 0:  # since it may be a protein seq
+                    strand = "+"
+                else:
+                    strand = "-"
+                qstarts = _reorient_starts(
+                    [x[0] for x in hsp.query_range_all],
+                    hsp.query_span_all,
+                    qresult.seq_len,
+                    hsp[0].query_strand,
+                )
+
+                if hsp[0].hit_strand == 1:
+                    hstrand = 1
+                    # only write hit strand if it was present in the source file
+                    if hsp._has_hit_strand:
+                        strand += "+"
+                else:
+                    hstrand = -1
+                    strand += "-"
+                hstarts = _reorient_starts(
+                    [x[0] for x in hsp.hit_range_all],
+                    hsp.hit_span_all,
+                    hit.seq_len,
+                    hstrand,
+                )
+
+                line.append(strand)
+                line.append(qresult.id)
+                line.append(qresult.seq_len)
+                line.append(hsp.query_start)
+                line.append(hsp.query_end)
+                line.append(hit.id)
+                line.append(hit.seq_len)
+                line.append(hsp.hit_start)
+                line.append(hsp.hit_end)
+                line.append(len(hsp))
+                line.append(",".join(str(x) for x in block_sizes) + ",")
+                line.append(",".join(str(x) for x in qstarts) + ",")
+                line.append(",".join(str(x) for x in hstarts) + ",")
+
+                if self.pslx:
+                    line.append(",".join(str(x.seq) for x in hsp.query_all) + ",")
+                    line.append(",".join(str(x.seq) for x in hsp.hit_all) + ",")
+
+                qresult_lines.append("\t".join(str(x) for x in line))
+
+        return "\n".join(qresult_lines) + "\n"
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__init__.py b/code/lib/Bio/SearchIO/ExonerateIO/__init__.py
new file mode 100644
index 0000000..7aaa9f5
--- /dev/null
+++ b/code/lib/Bio/SearchIO/ExonerateIO/__init__.py
@@ -0,0 +1,252 @@
+# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO support for Exonerate output formats.
+
+This module adds support for handling Exonerate outputs. Exonerate is a generic
+tool for pairwise sequence comparison that allows you to align sequences using
+several different models.
+
+Bio.SearchIO.ExonerateIO was tested on the following Exonerate versions and
+models:
+
+  - version: 2.2
+  - models:
+    - affine:local          - cdna2genome
+    - coding2coding         - est2genome
+    - genome2genome         - ner
+    - protein2dna           - protein2genome
+    - ungapped              - ungapped:translated
+
+Although model testing was not exhaustive, ExonerateIO should be able to cope
+with all Exonerate models. Please file a bug report if you stumble upon an
+unparseable file.
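+
+A minimal parsing sketch using the standard Bio.SearchIO functions (the file
+name 'output.exn' here is a hypothetical example)::
+
+    from Bio import SearchIO
+    for qresult in SearchIO.parse('output.exn', 'exonerate-text'):
+        print(qresult.id, len(qresult))  # query ID and number of hits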
+
+More information on Exonerate is available on its home page at
+www.ebi.ac.uk/~guy/exonerate/
+
+
+Supported Formats
+=================
+
+ - Plain text alignment - 'exonerate-text'   - parsing, indexing
+ - Vulgar line          - 'exonerate-vulgar' - parsing, indexing
+ - Cigar line           - 'exonerate-cigar'  - parsing, indexing
+
+On Exonerate, these output formats are not exclusive to one another. For
+example, you may have both plain text and vulgar output in the same file.
+ExonerateIO can only handle one of these at a time, however. If you have a file
+containing both plain text and vulgar lines, for example, you have to pick
+either 'exonerate-text' or 'exonerate-vulgar' to parse it.
+
+Due to the cigar format specification, many features of the alignments such as
+introns or frameshifts may be collapsed into a single feature (in this case,
+they are labelled 'D' for 'deletion'). The parser does not attempt to guess
+whether the D label it encounters is a real deletion or a collapsed feature.
+As such, parsing or indexing using 'exonerate-cigar' may yield different results
+compared to 'exonerate-text' or 'exonerate-vulgar'.
+
+
+exonerate-text
+==============
+
+The plain text output / C4 alignment is the output triggered by the
+'--showalignment' flag. Compared to the two other output formats, this format
+contains the most information, having the complete query and hit sequences of
+the alignment.
+
+Here are some examples of the C4 output alignment that ExonerateIO can handle
+(coordinates not drawn to scale)::
+
+    1. simple ungapped alignments
+
+           1 : ATGGGCAATATCCTTCGGAAAGGTCAGCAAAT : 56
+               ||||||||||||||||||||||||||||||||
+     1319275 : ATGGGCAATATCCTTCGGAAAGGTCAGCAAAT : 1319220
+
+    2. alignments with frameshifts:
+
+         129 : -TGCCGTTACCAT----GACGAAAGTATTAAT : 160
+               -CysArgTyrHis----AspGluSerIleAsn
+               #||||||||||||####|||||||||||||||
+               #CysArgTyrHis####AspGluSerIleAsn
+     1234593 : GTGCCGTTACCATCGGTGACGAAAGTATTAAT : 1234630
+
+    3. alignments with introns and split codons:
+
+         382 : {A}                             {CC}AAA : 358
+               AAA{T}  >>>> Target Intron 3 >>>>  {hr}LysATGAGCGATGAAAATA
+               || { }++                 55423 bp ++{ } !  ||| ||||||||||
+               AAC{L}gt.........................ag{eu}AspTTGAATGATGAAAATA
+       42322 : {C}                             {TG}GAT : 97769
+
+    4. alignments with NER blocks
+
+         111 : CAGAAAA--<   31  >--CTGCCCAGAAT--<   10  >--AACGAGCGTTCCG- : 184
+               | |||||--< NER 1 >--| ||||| |  |--<  NER 2 >--||| | ||||||-
+      297911 : CTGAAAA--<   29  >--CCGCCCAAAGT--<   13  >--AACTGGAGTTCCG- : 297993
+
+ExonerateIO utilizes the HSPFragment model quite extensively to deal with
+non-ungapped alignments. For any single HSPFragment, if ExonerateIO sees an
+intron, a NER block, or a frameshift, it will break the fragment into two
+HSPFragment objects and adjust each of their start and end coordinates
+appropriately.
+
+You may notice that Exonerate always uses three-letter amino acid codes to
+display protein sequences. If the protein itself is part of the query sequence,
+such as in the protein2dna model, ExonerateIO will transform the protein
+sequence into one-letter codes. This is because the SeqRecord objects that
+store the sequences are designed for single-letter sequences only. If Exonerate
+also outputs the underlying nucleotide sequence, it will be saved into an
+``aln_annotation`` entry as a list of triplets.
+
+If the protein sequence is not part of the actual alignment, such as in the
+est2genome or genome2genome models, ExonerateIO will keep the three-letter
+codes and store them as ``aln_annotation`` entries.
In these cases, the hit and +query sequences may be used directly as SeqRecord objects as they are one-letter +nucleotide codes. The three-letter protein sequences are then stored as entries +in the ``aln_annotation`` dictionary. + + +For 'exonerate-text', ExonerateIO provides the following object attributes: + ++-----------------+-------------------------+----------------------------------+ +| Object | Attribute | Value | ++=================+=========================+==================================+ +| QueryResult | description | query sequence description | +| +-------------------------+----------------------------------+ +| | id | query sequence ID | +| +-------------------------+----------------------------------+ +| | model | alignment model | +| +-------------------------+----------------------------------+ +| | program | 'exonerate' | ++-----------------+-------------------------+----------------------------------+ +| Hit | description | hit sequence description | +| +-------------------------+----------------------------------+ +| | id | hit sequence ID | ++-----------------+-------------------------+----------------------------------+ +| HSP | hit_split_codons | list of split codon coordinates | +| | | in the hit sequence | +| +-------------------------+----------------------------------+ +| | score | alignment score | +| +-------------------------+----------------------------------+ +| | query_split_codons | list of split codon coordinates | +| | | in the query sequence | ++-----------------+-------------------------+----------------------------------+ +| HSPFragment | aln_annotation | alignment similarity string, hit | +| | | sequence annotation, and/or | +| | | query sequence annotation | +| +-------------------------+----------------------------------+ +| | hit | hit sequence | +| +-------------------------+----------------------------------+ +| | hit_end | hit sequence end coordinate | +| +-------------------------+----------------------------------+ +| | hit_frame | hit sequence reading frame | +| +-------------------------+----------------------------------+ +| | hit_start | hit sequence start coordinate | +| +-------------------------+----------------------------------+ +| | hit_strand | hit sequence strand | +| +-------------------------+----------------------------------+ +| | query | query sequence | +| +-------------------------+----------------------------------+ +| | query_end | query sequence end coordinate | +| +-------------------------+----------------------------------+ +| | query_frame | query sequence reading frame | +| +-------------------------+----------------------------------+ +| | query_start | query sequence start coordinate | +| +-------------------------+----------------------------------+ +| | query_strand | query sequence strand | ++-----------------+-------------------------+----------------------------------+ + +Note that you can also use the default HSP or HSPFragment properties. For +example, to check the intron coordinates of your result you can use the +``query_inter_ranges`` or ``hit_inter_ranges`` properties: + + >>> from Bio import SearchIO + >>> fname = 'Exonerate/exn_22_m_genome2genome.exn' + >>> all_qresult = list(SearchIO.parse(fname, 'exonerate-text')) + >>> hsp = all_qresult[-1][-1][-1] # last qresult, last hit, last hsp + >>> hsp + HSP(...) 
+    >>> hsp.query_inter_ranges
+    [(388, 449), (284, 319), (198, 198), (114, 161)]
+    >>> hsp.hit_inter_ranges
+    [(487387, 641682), (386207, 487327), (208677, 386123), (71917, 208639)]
+
+Here you can see that for both query and hit introns, the coordinates
+in each tuple are always (start, end) where start <= end. But when you compare
+each tuple to the next, the coordinates decrease. This is an indication that
+both the query and hit sequences lie on the minus strand. Exonerate outputs
+minus strand results in a decreasing manner; the start coordinate is always
+bigger than the end coordinate. ExonerateIO preserves the fragment ordering as
+a whole, but uses its own standard to store an individual fragment's start and
+end coordinates.
+
+You may also notice that the third tuple in ``query_inter_ranges`` is
+(198, 198), two identical numbers. This means that the query sequence does not
+have any gaps at that position. The gap is only present in the hit sequence,
+where we see that the third tuple contains (208677, 386123), a gap of about
+177k bases.
+
+Another example is to use the ``hit_frame_all`` and ``query_frame_all``
+properties to see if there are any frameshifts in your alignment:
+
+    >>> from Bio import SearchIO
+    >>> fname = 'Exonerate/exn_22_m_coding2coding_fshifts.exn'
+    >>> qresult = next(SearchIO.parse(fname, 'exonerate-text'))
+    >>> hsp = qresult[0][0]  # first hit, first hsp
+    >>> hsp
+    HSP(...)
+    >>> hsp.query_frame_all
+    [1, 2, 2, 2]
+    >>> hsp.hit_frame_all
+    [1, 1, 3, 1]
+
+Here you can see that the alignment as a whole has three frameshifts. The first
+one occurs in the query sequence, after the first fragment (1 -> 2 shift), the
+second one occurs in the hit sequence, after the second fragment (1 -> 3
+shift), and the last one also occurs in the hit sequence, before the last
+fragment (3 -> 1 shift).
+
+There are other default HSP properties that you can use to ease your workflow.
+Please refer to the HSP object documentation for more details.
+
+
+exonerate-vulgar
+================
+
+The vulgar format provides a compact way of representing alignments created by
+Exonerate. In general, it contains the same information as the plain text
+output except for the 'model' information and the actual sequences themselves.
+You can expect the coordinates obtained from 'exonerate-text' and
+'exonerate-vulgar' to be the same. Both formats also create HSPFragment
+objects using the same triggers: introns, NER blocks, and/or frameshifts.
+
+
+exonerate-cigar
+===============
+
+The cigar format provides an even more compact representation of Exonerate
+alignments. However, this comes at the cost of losing information. In the
+cigar format, for example, introns are treated as simple deletions. This makes
+it impossible for the parser to distinguish between simple deletions and
+intron regions. As such, 'exonerate-cigar' may produce different sets of
+coordinates and fragments compared to 'exonerate-vulgar' or 'exonerate-text'.
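+
+Whichever of the three formats you pick, indexed (random) access works the
+same way through the standard Bio.SearchIO functions. A minimal sketch,
+assuming a hypothetical file name and query ID::
+
+    from Bio import SearchIO
+    idx = SearchIO.index('output.exn', 'exonerate-cigar')
+    qresult = idx['my_query_id']  # keys are query IDs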
+ +""" + +# Known issues & gotchas: +# - The cigar parser does not use the extended cigar string; only supports MID +# - Cigar and vulgar parsing results will most likely be different, due to the +# different type of data stored by both formats + +from .exonerate_text import ExonerateTextParser, ExonerateTextIndexer +from .exonerate_vulgar import ExonerateVulgarParser, ExonerateVulgarIndexer +from .exonerate_cigar import ExonerateCigarParser, ExonerateCigarIndexer + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..f32a000 Binary files /dev/null and b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/_base.cpython-37.pyc b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/_base.cpython-37.pyc new file mode 100644 index 0000000..8757a74 Binary files /dev/null and b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/_base.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_cigar.cpython-37.pyc b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_cigar.cpython-37.pyc new file mode 100644 index 0000000..7aec49b Binary files /dev/null and b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_cigar.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_text.cpython-37.pyc b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_text.cpython-37.pyc new file mode 100644 index 0000000..9c0fe21 Binary files /dev/null and b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_text.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_vulgar.cpython-37.pyc b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_vulgar.cpython-37.pyc new file mode 100644 index 0000000..fab1cfe Binary files /dev/null and b/code/lib/Bio/SearchIO/ExonerateIO/__pycache__/exonerate_vulgar.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/ExonerateIO/_base.py b/code/lib/Bio/SearchIO/ExonerateIO/_base.py new file mode 100644 index 0000000..190f80a --- /dev/null +++ b/code/lib/Bio/SearchIO/ExonerateIO/_base.py @@ -0,0 +1,534 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
+"""Bio.SearchIO abstract base parser for Exonerate standard output format.""" + +import re +from functools import reduce +from abc import ABC, abstractmethod + +from Bio.SearchIO._index import SearchIndexer +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment +from Bio.SeqUtils import seq1 + + +# strand char-value mapping +_STRAND_MAP = {"+": 1, "-": -1, ".": 0} + +_RE_SHIFTS = re.compile(r"(#+)") +# regex for checking whether a vulgar line has protein/translated components +_RE_TRANS = re.compile(r"[53ISCF]") + + +def _set_frame(frag): + """Set the HSPFragment frames (PRIVATE).""" + frag.hit_frame = (frag.hit_start % 3 + 1) * frag.hit_strand + frag.query_frame = (frag.query_start % 3 + 1) * frag.query_strand + + +def _make_triplets(seq, phase=0): + """Select a valid amino acid sequence given a 3-letter code input (PRIVATE). + + This function takes a single three-letter amino acid sequence and the phase + of the sequence to return the longest intact amino acid sequence possible. + Parts of the input sequence before and after the selected sequence are also + returned. + + This is an internal private function and is meant for parsing Exonerate's + three-letter amino acid output. + + >>> from Bio.SearchIO.ExonerateIO._base import _make_triplets + >>> _make_triplets('GlyThrSerAlaPro') + ('', ['Gly', 'Thr', 'Ser', 'Ala', 'Pro'], '') + >>> _make_triplets('yThrSerAla', phase=1) + ('y', ['Thr', 'Ser', 'Ala'], '') + >>> _make_triplets('yThrSerAlaPr', phase=1) + ('y', ['Thr', 'Ser', 'Ala'], 'Pr') + + """ + pre = seq[:phase] + np_seq = seq[phase:] + non_triplets = len(np_seq) % 3 + post = "" if not non_triplets else np_seq[-1 * non_triplets :] + intacts = [np_seq[3 * i : 3 * (i + 1)] for i in range(len(np_seq) // 3)] + return pre, intacts, post + + +def _get_fragments_coord(frags): + """Return the letter coordinate of the given list of fragments (PRIVATE). + + This function takes a list of three-letter amino acid sequences and + returns a list of coordinates for each fragment had all the input + sequences been flattened. + + This is an internal private function and is meant for parsing Exonerate's + three-letter amino acid output. + + >>> from Bio.SearchIO.ExonerateIO._base import _get_fragments_coord + >>> _get_fragments_coord(['Thr', 'Ser', 'Ala']) + [0, 3, 6] + >>> _get_fragments_coord(['Thr', 'SerAlaPro', 'GlyLeu']) + [0, 3, 12] + >>> _get_fragments_coord(['Thr', 'SerAlaPro', 'GlyLeu', 'Cys']) + [0, 3, 12, 18] + + """ + if not frags: + return [] + # first fragment always starts from position 0 + init = [0] + return reduce(lambda acc, frag: acc + [acc[-1] + len(frag)], frags[:-1], init) + + +def _get_fragments_phase(frags): + """Return the phases of the given list of 3-letter amino acid fragments (PRIVATE). + + This is an internal private function and is meant for parsing Exonerate's + three-letter amino acid output. + + >>> from Bio.SearchIO.ExonerateIO._base import _get_fragments_phase + >>> _get_fragments_phase(['Thr', 'Ser', 'Ala']) + [0, 0, 0] + >>> _get_fragments_phase(['ThrSe', 'rAla']) + [0, 1] + >>> _get_fragments_phase(['ThrSe', 'rAlaLeu', 'ProCys']) + [0, 1, 0] + >>> _get_fragments_phase(['ThrSe', 'rAlaLeuP', 'roCys']) + [0, 1, 2] + >>> _get_fragments_phase(['ThrSe', 'rAlaLeuPr', 'oCys']) + [0, 1, 1] + + """ + return [(3 - (x % 3)) % 3 for x in _get_fragments_coord(frags)] + + +def _adjust_aa_seq(fraglist): + """Transform 3-letter AA codes of input fragments to one-letter codes (PRIVATE). + + Argument fraglist should be a list of HSPFragments objects. 
+ """ + custom_map = {"***": "*", "<->": "-"} + hsp_hstart = fraglist[0].hit_start + hsp_qstart = fraglist[0].query_start + frag_phases = _get_fragments_phase(fraglist) + for frag, phase in zip(fraglist, frag_phases): + assert frag.query_strand == 0 or frag.hit_strand == 0 + # hit step may be -1 as we're aligning to DNA + hstep = 1 if frag.hit_strand >= 0 else -1 + + # set fragment phase + frag.phase = phase + + # fragment should have a length that is a multiple of 3 + # assert len(frag) % 3 == 0 + qseq = str(frag.query.seq) + q_triplets_pre, q_triplets, q_triplets_post = _make_triplets(qseq, phase) + + hseq = str(frag.hit.seq) + h_triplets_pre, h_triplets, h_triplets_post = _make_triplets(hseq, phase) + + # get one letter codes + # and replace gap codon markers and termination characters + hseq1_pre = "X" if h_triplets_pre else "" + hseq1_post = "X" if h_triplets_post else "" + hseq1 = seq1("".join(h_triplets), custom_map=custom_map) + hstart = hsp_hstart + (len(hseq1_pre) * hstep) + hend = hstart + len(hseq1.replace("-", "")) * hstep + + qseq1_pre = "X" if q_triplets_pre else "" + qseq1_post = "X" if q_triplets_post else "" + qseq1 = seq1("".join(q_triplets), custom_map=custom_map) + qstart = hsp_qstart + len(qseq1_pre) + qend = qstart + len(qseq1.replace("-", "")) + + # replace the old frag sequences with the new ones + frag.hit = None + frag.query = None + frag.hit = hseq1_pre + hseq1 + hseq1_post + frag.query = qseq1_pre + qseq1 + qseq1_post + + # set coordinates for the protein sequence + if frag.query_strand == 0: + frag.query_start, frag.query_end = qstart, qend + elif frag.hit_strand == 0: + frag.hit_start, frag.hit_end = hstart, hend + + # update alignment annotation + # by turning them into list of triplets + for annot, annotseq in frag.aln_annotation.items(): + pre, intact, post = _make_triplets(annotseq, phase) + frag.aln_annotation[annot] = ( + list(filter(None, [pre])) + intact + list(filter(None, [post])) + ) + + # update values for next iteration + hsp_hstart, hsp_qstart = hend, qend + + return fraglist + + +def _split_fragment(frag): + """Split one HSPFragment containing frame-shifted alignment into two (PRIVATE).""" + # given an HSPFragment object with frameshift(s), this method splits it + # into fragments without frameshifts by sequentially chopping it off + # starting from the beginning + simil = frag.aln_annotation["similarity"] + # we should have at least 1 frame shift for splitting + assert simil.count("#") > 0 + + split_frags = [] + qstep = 1 if frag.query_strand >= 0 else -1 + hstep = 1 if frag.hit_strand >= 0 else -1 + qpos = min(frag.query_range) if qstep >= 0 else max(frag.query_range) + hpos = min(frag.hit_range) if qstep >= 0 else max(frag.hit_range) + abs_pos = 0 + # split according to hit, then query + while simil: + + try: + shifts = re.search(_RE_SHIFTS, simil).group(1) + s_start = simil.find(shifts) + s_stop = s_start + len(shifts) + split = frag[abs_pos : abs_pos + s_start] + except AttributeError: # no '#' in simil, i.e. 
last frag + shifts = "" + s_start = 0 + s_stop = len(simil) + split = frag[abs_pos:] + + # coordinates for the split strand + qstart, hstart = qpos, hpos + qpos += ( + len(split) - sum(split.query.seq.count(x) for x in ("-", "<", ">")) + ) * qstep + hpos += ( + len(split) - sum(split.hit.seq.count(x) for x in ("-", "<", ">")) + ) * hstep + + split.hit_start = min(hstart, hpos) + split.query_start = min(qstart, qpos) + split.hit_end = max(hstart, hpos) + split.query_end = max(qstart, qpos) + + # account for frameshift length + abs_slice = slice(abs_pos + s_start, abs_pos + s_stop) + if len(frag.aln_annotation) == 2: + seqs = (frag[abs_slice].query.seq, frag[abs_slice].hit.seq) + elif len(frag.aln_annotation) == 3: + seqs = ( + frag[abs_slice].aln_annotation["query_annotation"], + frag[abs_slice].aln_annotation["hit_annotation"], + ) + if "#" in seqs[0]: + qpos += len(shifts) * qstep + elif "#" in seqs[1]: + hpos += len(shifts) * hstep + + # set frame + _set_frame(split) + split_frags.append(split) + # set similarity string and absolute position for the next loop + simil = simil[s_stop:] + abs_pos += s_stop + + return split_frags + + +def _create_hsp(hid, qid, hspd): + """Return a list of HSP objects from the given parsed HSP values (PRIVATE).""" + frags = [] + # we are iterating over query_ranges, but hit_ranges works just as well + for idx, qcoords in enumerate(hspd["query_ranges"]): + # get sequences, create object + hseqlist = hspd.get("hit") + hseq = "" if hseqlist is None else hseqlist[idx] + qseqlist = hspd.get("query") + qseq = "" if qseqlist is None else qseqlist[idx] + frag = HSPFragment(hid, qid, hit=hseq, query=qseq) + # coordinates + frag.query_start = qcoords[0] + frag.query_end = qcoords[1] + frag.hit_start = hspd["hit_ranges"][idx][0] + frag.hit_end = hspd["hit_ranges"][idx][1] + # alignment annotation + try: + aln_annot = hspd.get("aln_annotation", {}) + for key, value in aln_annot.items(): + frag.aln_annotation[key] = value[idx] + except IndexError: + pass + # strands + frag.query_strand = hspd["query_strand"] + frag.hit_strand = hspd["hit_strand"] + # and append the hsp object to the list + if frag.aln_annotation.get("similarity") is not None: + if "#" in frag.aln_annotation["similarity"]: + frags.extend(_split_fragment(frag)) + continue + # try to set frame if there are translation in the alignment + if ( + len(frag.aln_annotation) > 1 + or frag.query_strand == 0 + or ("vulgar_comp" in hspd and re.search(_RE_TRANS, hspd["vulgar_comp"])) + ): + _set_frame(frag) + + frags.append(frag) + + # if the query is protein, we need to change the hit and query sequences + # from three-letter amino acid codes to one letter, and adjust their + # coordinates accordingly + if len(frags[0].aln_annotation) == 2: # 2 annotations == protein query + frags = _adjust_aa_seq(frags) + + hsp = HSP(frags) + # set hsp-specific attributes + for attr in ( + "score", + "hit_split_codons", + "query_split_codons", + "model", + "vulgar_comp", + "cigar_comp", + "molecule_type", + ): + if attr in hspd: + setattr(hsp, attr, hspd[attr]) + + return hsp + + +def _parse_hit_or_query_line(line): + """Parse the 'Query:' line of exonerate alignment outputs (PRIVATE).""" + try: + mark, id, desc = line.split(" ", 2) + except ValueError: # no desc + mark, id = line.split(" ", 1) + desc = "" + + return id, desc + + +class _BaseExonerateParser(ABC): + """Abstract base class iterator for exonerate format.""" + + _ALN_MARK = None + + def __init__(self, handle): + self.handle = handle + self.has_c4_alignment = False + + 
def __iter__(self): + # read line until the first alignment block or cigar/vulgar lines + while True: + self.line = self.handle.readline() + # flag for human-readable alignment block + if self.line.startswith("C4 Alignment:") and not self.has_c4_alignment: + self.has_c4_alignment = True + if ( + self.line.startswith("C4 Alignment:") + or self.line.startswith("vulgar:") + or self.line.startswith("cigar:") + ): + break + elif not self.line or self.line.startswith("-- completed "): + return + + for qresult in self._parse_qresult(): + qresult.program = "exonerate" + # HACK: so that all descriptions are set + qresult.description = qresult.description + for hit in qresult: + hit.description = hit.description + yield qresult + + def read_until(self, bool_func): + """Read the file handle until the given bool function returns True.""" + while True: + if not self.line or bool_func(self.line): + return + else: + self.line = self.handle.readline() + + @abstractmethod + def parse_alignment_block(self, header): + raise NotImplementedError + + def _parse_alignment_header(self): + # read all header lines and store them + aln_header = [] + # header is everything before the first empty line + while self.line.strip(): + aln_header.append(self.line.strip()) + self.line = self.handle.readline() + # then parse them + qresult, hit, hsp = {}, {}, {} + for line in aln_header: + # query line + if line.startswith("Query:"): + qresult["id"], qresult["description"] = _parse_hit_or_query_line(line) + # target line + elif line.startswith("Target:"): + hit["id"], hit["description"] = _parse_hit_or_query_line(line) + # model line + elif line.startswith("Model:"): + qresult["model"] = line.split(" ", 1)[1] + # score line + elif line.startswith("Raw score:"): + hsp["score"] = line.split(" ", 2)[2] + # query range line + elif line.startswith("Query range:"): + # line is always 'Query range: \d+ -> \d+', so we can pluck + # the numbers directly + hsp["query_start"], hsp["query_end"] = line.split(" ", 4)[2:5:2] + # hit range line + elif line.startswith("Target range:"): + # same logic with query range + hsp["hit_start"], hsp["hit_end"] = line.split(" ", 4)[2:5:2] + + # determine strand + if qresult["description"].endswith(":[revcomp]"): + hsp["query_strand"] = "-" + qresult["description"] = qresult["description"].replace(":[revcomp]", "") + elif "protein2" in qresult["model"]: + hsp["query_strand"] = "." + else: + hsp["query_strand"] = "+" + if hit["description"].endswith(":[revcomp]"): + hsp["hit_strand"] = "-" + hit["description"] = hit["description"].replace(":[revcomp]", "") + elif "2protein" in qresult["model"]: + hsp["hit_strand"] = "." 
+ else: + hsp["hit_strand"] = "+" + + # NOTE: we haven't processed the coordinates types + # and the strands are not yet Biopython's standard (1 / -1 / 0) + # since it's easier if we do the conversion later + + return {"qresult": qresult, "hit": hit, "hsp": hsp} + + def _parse_qresult(self): + # state values + state_EOF = 0 + state_QRES_NEW = 1 + state_QRES_SAME = 3 + state_HIT_NEW = 2 + state_HIT_SAME = 4 + # initial dummies + qres_state, hit_state = None, None + file_state = None + cur_qid, cur_hid = None, None + prev_qid, prev_hid = None, None + cur, prev = None, None + hit_list, hsp_list = [], [] + # if the file has c4 alignments, use that as the alignment mark + if self.has_c4_alignment: + self._ALN_MARK = "C4 Alignment:" + + while True: + self.read_until(lambda line: line.startswith(self._ALN_MARK)) + if cur is not None: + prev = cur + prev_qid = cur_qid + prev_hid = cur_hid + # only parse the result row if it's not EOF + if self.line: + assert self.line.startswith(self._ALN_MARK), self.line + # create temp dicts for storing parsed values + header = {"qresult": {}, "hit": {}, "hsp": {}} + # if the file has c4 alignments, try to parse the header + if self.has_c4_alignment: + self.read_until(lambda line: line.strip().startswith("Query:")) + header = self._parse_alignment_header() + # parse the block contents + cur = self.parse_alignment_block(header) + cur_qid = cur["qresult"]["id"] + cur_hid = cur["hit"]["id"] + elif not self.line or self.line.startswith("-- completed "): + file_state = state_EOF + cur_qid, cur_hid = None, None + + # get the state of hit and qresult + if prev_qid != cur_qid: + qres_state = state_QRES_NEW + else: + qres_state = state_QRES_SAME + # new hits are hits with different ids or hits in a new query + if prev_hid != cur_hid or qres_state == state_QRES_NEW: + hit_state = state_HIT_NEW + else: + hit_state = state_HIT_SAME + + if prev is not None: + hsp = _create_hsp(prev_hid, prev_qid, prev["hsp"]) + hsp_list.append(hsp) + + if hit_state == state_HIT_NEW: + hit = Hit(hsp_list) + for attr, value in prev["hit"].items(): + setattr(hit, attr, value) + hit_list.append(hit) + hsp_list = [] + + if qres_state == state_QRES_NEW or file_state == state_EOF: + qresult = QueryResult(id=prev_qid) + for hit in hit_list: + # not using append since Exonerate may separate the + # same hit if it has different strands + qresult.absorb(hit) + for attr, value in prev["qresult"].items(): + setattr(qresult, attr, value) + yield qresult + if file_state == state_EOF: + break + hit_list = [] + + # only readline() here if we're not parsing C4 alignments + # C4 alignments readline() is handled by its parse_alignment_block + # function + if not self.has_c4_alignment: + self.line = self.handle.readline() + + +class _BaseExonerateIndexer(SearchIndexer): + """Indexer class for Exonerate plain text.""" + + _parser = None # should be defined by subclass + _query_mark = None # this one too + + def get_qresult_id(self, pos): + raise NotImplementedError("Should be defined by subclass") + + def __iter__(self): + """Iterate over the file handle; yields key, start offset, and length.""" + handle = self._handle + handle.seek(0) + qresult_key = None + + while True: + start_offset = handle.tell() + line = handle.readline() + if line.startswith(self._query_mark): + if qresult_key is None: + qresult_key = self.get_qresult_id(start_offset) + qresult_offset = start_offset + else: + curr_key = self.get_qresult_id(start_offset) + if curr_key != qresult_key: + yield qresult_key, qresult_offset, start_offset - 
qresult_offset + qresult_key = curr_key + qresult_offset = start_offset + handle.seek(qresult_offset) + elif not line: + yield qresult_key, qresult_offset, start_offset - qresult_offset + break + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/ExonerateIO/exonerate_cigar.py b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_cigar.py new file mode 100644 index 0000000..7ba8a08 --- /dev/null +++ b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_cigar.py @@ -0,0 +1,109 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for Exonerate cigar output format.""" + +import re + +from ._base import _BaseExonerateParser, _STRAND_MAP +from .exonerate_vulgar import ExonerateVulgarIndexer + + +__all__ = ("ExonerateCigarParser", "ExonerateCigarIndexer") + + +# precompile regex +_RE_CIGAR = re.compile( + r"""^cigar:\s+ + (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # query: ID, start, end, strand + (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # hit: ID, start, end, strand + (\d+)(\s+.*)$ # score, vulgar components + """, + re.VERBOSE, +) + + +class ExonerateCigarParser(_BaseExonerateParser): + """Parser for Exonerate cigar strings.""" + + _ALN_MARK = "cigar" + + def parse_alignment_block(self, header): + """Parse alignment block for cigar format, return query results, hits, hsps.""" + qresult = header["qresult"] + hit = header["hit"] + hsp = header["hsp"] + self.read_until(lambda line: line.startswith("cigar")) + cigars = re.search(_RE_CIGAR, self.line) + # if the file has c4 alignments + # check if cigar values match our previously parsed header values + if self.has_c4_alignment: + assert qresult["id"] == cigars.group(1) + assert hsp["query_start"] == cigars.group(2) + assert hsp["query_end"] == cigars.group(3) + assert hsp["query_strand"] == cigars.group(4) + assert hit["id"] == cigars.group(5) + assert hsp["hit_start"] == cigars.group(6) + assert hsp["hit_end"] == cigars.group(7) + assert hsp["hit_strand"] == cigars.group(8) + assert hsp["score"] == cigars.group(9) + else: + qresult["id"] = cigars.group(1) + hsp["query_start"] = cigars.group(2) + hsp["query_end"] = cigars.group(3) + hsp["query_strand"] = cigars.group(4) + hit["id"] = cigars.group(5) + hsp["hit_start"] = cigars.group(6) + hsp["hit_end"] = cigars.group(7) + hsp["hit_strand"] = cigars.group(8) + hsp["score"] = cigars.group(9) + + # adjust strands + hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]] + hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]] + # cast coords into ints + qstart = int(hsp["query_start"]) + qend = int(hsp["query_end"]) + hstart = int(hsp["hit_start"]) + hend = int(hsp["hit_end"]) + # set coords (start <= end) + hsp["query_start"] = min(qstart, qend) + hsp["query_end"] = max(qstart, qend) + hsp["hit_start"] = min(hstart, hend) + hsp["hit_end"] = max(hstart, hend) + # cast score into int + hsp["score"] = int(hsp["score"]) + # store cigar components + hsp["cigar_comp"] = cigars.group(10) + # HACK: since we can't really figure out exactly when a + # HSP starts or ends, we set the entire alignment as one HSP + hsp["query_ranges"] = [(hsp["query_start"], hsp["query_end"])] + hsp["hit_ranges"] = [(hsp["hit_start"], hsp["hit_end"])] + + return {"qresult": 
qresult, "hit": hit, "hsp": hsp} + + +class ExonerateCigarIndexer(ExonerateVulgarIndexer): + """Indexer class for exonerate cigar lines.""" + + _parser = ExonerateCigarParser + _query_mark = b"cigar" + + def get_qresult_id(self, pos): + """Return the query ID of the nearest cigar line.""" + handle = self._handle + handle.seek(pos) + # get line, check if it's a vulgar line, and get query ID + line = handle.readline() + assert line.startswith(self._query_mark), line + id = re.search(_RE_CIGAR, line.decode()) + return id.group(1) + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/ExonerateIO/exonerate_text.py b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_text.py new file mode 100644 index 0000000..b53e1e4 --- /dev/null +++ b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_text.py @@ -0,0 +1,540 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for Exonerate plain text output format.""" + +import re +from itertools import chain + + +from ._base import ( + _BaseExonerateParser, + _BaseExonerateIndexer, + _STRAND_MAP, + _parse_hit_or_query_line, +) +from .exonerate_vulgar import _RE_VULGAR + + +__all__ = ("ExonerateTextParser", "ExonerateTextIndexer") + + +# for capturing sequences in alignment blocks +# e.g. ' 529 : ATCCCTTATCTCTTTATCTTGTA : 472' +_RE_ALN_ROW = re.compile(r"\s*\d+\s+: (.*) :\s+\d+") +# for splitting the line based on intron annotations +# e.g. ' >>>> Target Intron 1 >>>> ' or 'gt.........................ag' +_RE_EXON = re.compile( + r"[atgc ]{2,}?(?:(?:[<>]+ \w+ Intron \d+ [<>]+)|(?:\.+))[atgc ]{2,}?" +) +# captures the intron length +# from e.g. '61 bp // 154295 bp' (joint intron lengths) or '177446 bp' +_RE_EXON_LEN = re.compile(r"(?:(\d+) bp // (\d+) bp)|(?:(\d+) bp)") +# for splitting lines in the NER model +_RE_NER = re.compile(r"--<\s+\d+\s+>--") +# for capturing NER gap lengths +_RE_NER_LEN = re.compile(r"--<\s+(\d+)\s+>--") +# regexes for capturing the letters inside curly braces +# no. of letters is either 1 or 2, since they are split codons +_RE_SCODON_START = re.compile(r"\{(\w{1,2})\}$") +_RE_SCODON_END = re.compile(r"^\{(\w{1,2})\}") + + +def _flip_codons(codon_seq, target_seq): + """Flips the codon characters from one seq to another (PRIVATE).""" + a, b = "", "" + for char1, char2 in zip(codon_seq, target_seq): + # no need to do anything if the codon seq line has nothing + if char1 == " ": + a += char1 + b += char2 + else: + a += char2 + b += char1 + + return a, b + + +def _get_block_coords(parsed_seq, row_dict, has_ner=False): + """Return a list of start, end coordinates for each given block in the sequence (PRIVATE).""" + start = 0 + coords = [] + if not has_ner: + splitter = _RE_EXON + else: + splitter = _RE_NER + + # use the query line for reference + seq = parsed_seq[row_dict["query"]] + + for block in re.split(splitter, seq): + start += seq[start:].find(block) + end = start + len(block) + coords.append((start, end)) + + return coords + + +def _get_inter_coords(coords, strand=1): + """Return list of pairs covering intervening ranges (PRIVATE). + + From the given pairs of coordinates, returns a list of pairs + covering the intervening ranges. 
+ """ + # adapted from Python's itertools guide + # if strand is -1, adjust coords to the ends and starts are chained + if strand == -1: + sorted_coords = [(max(a, b), min(a, b)) for a, b in coords] + inter_coords = list(chain(*sorted_coords))[1:-1] + return list(zip(inter_coords[1::2], inter_coords[::2])) + else: + inter_coords = list(chain(*coords))[1:-1] + return list(zip(inter_coords[::2], inter_coords[1::2])) + + +def _stitch_rows(raw_rows): + """Stitches together the parsed alignment rows and returns them in a list (PRIVATE).""" + # deal with possible codon surprise! + # (i.e. alignments with codons using cdna2genome model) + # by creating additional rows to contain the codons + try: + max_len = max(len(x) for x in raw_rows) + for row in raw_rows: + assert len(row) == max_len + except AssertionError: + for idx, row in enumerate(raw_rows): + if len(row) != max_len: + # codons must be present in the query and hit (so +2) + assert len(row) + 2 == max_len + # add additional empty lines to contain codons + raw_rows[idx] = [" " * len(row[0])] + row + [" " * len(row[0])] + + cmbn_rows = [] + for idx, row in enumerate(raw_rows[0]): + cmbn_row = "".join(aln_row[idx] for aln_row in raw_rows) + cmbn_rows.append(cmbn_row) + + # the real aligned sequence is always the 'outer' one, so we want + # to flip them with their 'inner' pairs + if len(cmbn_rows) == 5: + # flip query sequence + cmbn_rows[0], cmbn_rows[1] = _flip_codons(cmbn_rows[0], cmbn_rows[1]) + # flip hit sequence + cmbn_rows[4], cmbn_rows[3] = _flip_codons(cmbn_rows[4], cmbn_rows[3]) + + return cmbn_rows + + +def _get_row_dict(row_len, model): + """Return a dictionary of row indices for parsing alignment blocks (PRIVATE).""" + idx = {} + # 3 lines, usually in dna vs dna models + if row_len == 3: + idx["query"] = 0 + idx["midline"] = 1 + idx["hit"] = 2 + idx["qannot"], idx["hannot"] = None, None + # 4 lines, in protein vs dna models or dna vs protein models + # TODO: currently we check this from the model string; is there + # a better way to do it? 
+ elif row_len == 4: + if "protein2" in model: + idx["query"] = 0 + idx["midline"] = 1 + idx["hit"] = 2 + idx["hannot"] = 3 + idx["qannot"] = None + elif "2protein" in model: + idx["query"] = 1 + idx["midline"] = 2 + idx["hit"] = 3 + idx["hannot"] = None + idx["qannot"] = 0 + else: + raise ValueError("Unexpected model: " + model) + # 5 lines, translated dna vs translated dna + elif row_len == 5: + # set sequence indexes + idx["qannot"] = 0 + idx["query"] = 1 + idx["midline"] = 2 + idx["hit"] = 3 + idx["hannot"] = 4 + else: + raise ValueError("Unexpected row count in alignment block: %i" % row_len) + return idx + + +def _get_blocks(rows, coords, idx): + """Return a list of dictionaries of sequences split by the coordinates (PRIVATE).""" + for idx_name in ("query", "hit", "midline", "qannot", "hannot"): + assert idx_name in idx + blocks = [] + for start, end in coords: + block = {} + # get seqs according to index + block["query"] = rows[idx["query"]][start:end] + block["hit"] = rows[idx["hit"]][start:end] + block["similarity"] = rows[idx["midline"]][start:end] + if idx["qannot"] is not None: + block["query_annotation"] = rows[idx["qannot"]][start:end] + if idx["hannot"] is not None: + block["hit_annotation"] = rows[idx["hannot"]][start:end] + blocks.append(block) + + return blocks + + +def _get_scodon_moves(tmp_seq_blocks): + """Get a dictionary of split codon locations relative to each fragment end (PRIVATE).""" + scodon_moves = {"query": [], "hit": []} + for seq_type in scodon_moves: + scoords = [] + for block in tmp_seq_blocks: + # check both ends of the sequence for residues in curly braces + m_start = re.search(_RE_SCODON_START, block[seq_type]) + m_end = re.search(_RE_SCODON_END, block[seq_type]) + if m_start: + m_start = len(m_start.group(1)) + scoords.append((m_start, 0)) + else: + scoords.append((0, 0)) + if m_end: + m_end = len(m_end.group(1)) + scoords.append((0, m_end)) + else: + scoords.append((0, 0)) + scodon_moves[seq_type] = scoords + + return scodon_moves + + +def _clean_blocks(tmp_seq_blocks): + """Remove curly braces (split codon markers) from the given sequences (PRIVATE).""" + seq_blocks = [] + for seq_block in tmp_seq_blocks: + for line_name in seq_block: + seq_block[line_name] = ( + seq_block[line_name].replace("{", "").replace("}", "") + ) + seq_blocks.append(seq_block) + + return seq_blocks + + +def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens): + """Return the length of introns between fragments (PRIVATE).""" + # set opposite type, for setting introns + opp_type = "hit" if seq_type == "query" else "query" + # list of flags to denote if an intron follows a block + # it reads e.g. 
this line: + # "ATGTT{TT} >>>> Target Intron 1 >>>> {G}TGTGTGTACATT" + # and sets the opposing sequence type's intron (since this + # line is present on the opposite sequence type line) + has_intron_after = ["Intron" in x[seq_type] for x in inter_blocks] + assert len(has_intron_after) == len(raw_inter_lens) + # create list containing coord adjustments incorporating + # intron lengths + inter_lens = [] + for flag, parsed_len in zip(has_intron_after, raw_inter_lens): + if flag: + # joint introns + if all(parsed_len[:2]): + # intron len is [0] if opp_type is query, otherwise it's [1] + intron_len = ( + int(parsed_len[0]) if opp_type == "query" else int(parsed_len[1]) + ) + # single hit/query introns + elif parsed_len[2]: + intron_len = int(parsed_len[2]) + else: + raise ValueError("Unexpected intron parsing result: %r" % parsed_len) + else: + intron_len = 0 + + inter_lens.append(intron_len) + + return inter_lens + + +def _comp_coords(hsp, seq_type, inter_lens): + """Fill the block coordinates of the given hsp dictionary (PRIVATE).""" + assert seq_type in ("hit", "query") + # manually fill the first coord + seq_step = 1 if hsp["%s_strand" % seq_type] >= 0 else -1 + fstart = hsp["%s_start" % seq_type] + # fend is fstart + number of residues in the sequence, minus gaps + fend = ( + fstart + + len(hsp[seq_type][0].replace("-", "").replace(">", "").replace("<", "")) + * seq_step + ) + coords = [(fstart, fend)] + # and start from the second block, after the first inter seq + for idx, block in enumerate(hsp[seq_type][1:]): + bstart = coords[-1][1] + inter_lens[idx] * seq_step + bend = bstart + seq_step * len(block.replace("-", "")) + coords.append((bstart, bend)) + + # adjust the coords so the smallest is [0], if strand is -1 + # couldn't do this in the previous steps since we need the initial + # block ordering + if seq_step != 1: + for idx, coord in enumerate(coords): + coords[idx] = coords[idx][1], coords[idx][0] + + return coords + + +def _comp_split_codons(hsp, seq_type, scodon_moves): + """Compute positions of split codons, store in given HSP dictionary (PRIVATE).""" + scodons = [] + for idx in range(len(scodon_moves[seq_type])): + pair = scodon_moves[seq_type][idx] + if not any(pair): + continue + else: + assert not all(pair) + a, b = pair + anchor_pair = hsp["%s_ranges" % seq_type][idx // 2] + strand = 1 if hsp["%s_strand" % seq_type] >= 0 else -1 + + if a: + func = max if strand == 1 else min + anchor = func(anchor_pair) + start_c, end_c = anchor + a * strand * -1, anchor + elif b: + func = min if strand == 1 else max + anchor = func(anchor_pair) + start_c, end_c = anchor + b * strand, anchor + scodons.append((min(start_c, end_c), max(start_c, end_c))) + + return scodons + + +class ExonerateTextParser(_BaseExonerateParser): + """Parser for Exonerate plain text output.""" + + _ALN_MARK = "C4 Alignment:" + + def parse_alignment_block(self, header): + """Parse alignment block, return query result, hits, hsps.""" + qresult = header["qresult"] + hit = header["hit"] + hsp = header["hsp"] + # check for values that must have been set by previous methods + for val_name in ( + "query_start", + "query_end", + "hit_start", + "hit_end", + "query_strand", + "hit_strand", + ): + assert val_name in hsp, hsp + + # get the alignment rows + # and stitch them so we have the full sequences in single strings + raw_aln_blocks, vulgar_comp = self._read_alignment() + # cmbn_rows still has split codon markers (curly braces) + cmbn_rows = _stitch_rows(raw_aln_blocks) + row_dict = _get_row_dict(len(cmbn_rows), 
qresult["model"]) + # get the sequence blocks + has_ner = "NER" in qresult["model"].upper() + seq_coords = _get_block_coords(cmbn_rows, row_dict, has_ner) + tmp_seq_blocks = _get_blocks(cmbn_rows, seq_coords, row_dict) + # get split codon temp coords for later use + # this result in pairs of base movement for both ends of each row + scodon_moves = _get_scodon_moves(tmp_seq_blocks) + # remove the split codon markers + seq_blocks = _clean_blocks(tmp_seq_blocks) + + # adjust strands + hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]] + hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]] + # cast coords into ints + hsp["query_start"] = int(hsp["query_start"]) + hsp["query_end"] = int(hsp["query_end"]) + hsp["hit_start"] = int(hsp["hit_start"]) + hsp["hit_end"] = int(hsp["hit_end"]) + # cast score into ints + hsp["score"] = int(hsp["score"]) + # set sequences + hsp["query"] = [x["query"] for x in seq_blocks] + hsp["hit"] = [x["hit"] for x in seq_blocks] + hsp["aln_annotation"] = {} + # set the molecule type + # currently only limited to models with protein queries + if ( + "protein2" in qresult["model"] + or "coding2" in qresult["model"] + or "2protein" in qresult["model"] + ): + hsp["molecule_type"] = "protein" + # get the annotations if they exist + for annot_type in ("similarity", "query_annotation", "hit_annotation"): + try: + hsp["aln_annotation"][annot_type] = [x[annot_type] for x in seq_blocks] + except KeyError: + pass + + # use vulgar coordinates if vulgar line is present and return + # if vulgar_comp is not None: + # hsp = parse_vulgar_comp(hsp, vulgar_comp) + + # return {'qresult': qresult, 'hit': hit, 'hsp': hsp} + + # otherwise we need to get the coordinates from the alignment + # get the intervening blocks first, so we can use them + # to adjust the coordinates + if not has_ner: + # get intervening coordinates and blocks, only if model is not ner + # ner models have a much more simple coordinate calculation + inter_coords = _get_inter_coords(seq_coords) + inter_blocks = _get_blocks(cmbn_rows, inter_coords, row_dict) + # returns a three-component tuple of intron lengths + # first two component filled == intron in hit and query + # last component filled == intron in hit or query + raw_inter_lens = re.findall(_RE_EXON_LEN, cmbn_rows[row_dict["midline"]]) + + # compute start and end coords for each block + for seq_type in ("query", "hit"): + + # ner blocks and intron blocks require different adjustments + if not has_ner: + opp_type = "hit" if seq_type == "query" else "query" + inter_lens = _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens) + else: + # for NER blocks, the length of the inter-fragment gaps is + # written on the same strand, so opp_type is seq_type + opp_type = seq_type + inter_lens = [ + int(x) + for x in re.findall(_RE_NER_LEN, cmbn_rows[row_dict[seq_type]]) + ] + + # check that inter_lens's length is len opp_type block - 1 + if len(inter_lens) != len(hsp[opp_type]) - 1: + raise ValueError( + "Length mismatch: %r vs %r" + % (len(inter_lens), len(hsp[opp_type]) - 1) + ) + # fill the hsp query and hit coordinates + hsp["%s_ranges" % opp_type] = _comp_coords(hsp, opp_type, inter_lens) + # and fill the split codon coordinates, if model != ner + # can't do this in the if-else clause above since we need to + # compute the ranges first + if not has_ner: + hsp["%s_split_codons" % opp_type] = _comp_split_codons( + hsp, opp_type, scodon_moves + ) + + # now that we've finished parsing coords, we can set the hit and start + # coord according to Biopython's convention 
(start <= end) + for seq_type in ("query", "hit"): + if hsp["%s_strand" % seq_type] == -1: + n_start = "%s_start" % seq_type + n_end = "%s_end" % seq_type + hsp[n_start], hsp[n_end] = hsp[n_end], hsp[n_start] + + return {"qresult": qresult, "hit": hit, "hsp": hsp} + + def _read_alignment(self): + """Read the raw alignment block strings, returns them in a list (PRIVATE).""" + raw_aln_blocks = [] + # flag to check whether we're in an alignment row + in_aln_row = False + # flag for vulgar line, if present, we can parse coordinates from it + vulgar_comp = None + while True: + + match = re.search(_RE_ALN_ROW, self.line.strip()) + # if we have a match, set flags and values + if match and not in_aln_row: + start_idx = self.line.index(match.group(1)) + row_len = len(match.group(1)) + in_aln_row = True + raw_aln_block = [] + # if we're in an alignment row, grab the sequence + if in_aln_row: + raw_aln_block.append(self.line[start_idx : start_idx + row_len]) + # reset flags and values if the line matches, we're in an alignment + # row, and there are more than 1 line in rows + if match and in_aln_row and len(raw_aln_block) > 1: + raw_aln_blocks.append(raw_aln_block) + start_idx = None + row_len = None + in_aln_row = False + + self.line = self.handle.readline() + # try to parse vulgar line if present + if self.line.startswith("vulgar"): + vulgar = re.search(_RE_VULGAR, self.line) + vulgar_comp = vulgar.group(10) + if not self.line or self.line.startswith(self._ALN_MARK): + # HACK: this is so that the parse_qresult method does not + # yield the objects before appending the last HSP. We are doing + # this to keep the parser compatible with outputs without + # human-readable alignment outputs. This also relies on the + # fact that repeated readline() always returns '' on EOF. + if not self.line: + self.line = "mock" + break + + return raw_aln_blocks, vulgar_comp + + +class ExonerateTextIndexer(_BaseExonerateIndexer): + """Indexer class for Exonerate plain text.""" + + _parser = ExonerateTextParser + _query_mark = b"C4 Alignment" + + def get_qresult_id(self, pos): + """Return the query ID from the nearest "Query:" line.""" + handle = self._handle + handle.seek(pos) + sentinel = b"Query:" + + while True: + line = handle.readline().strip() + if line.startswith(sentinel): + break + if not line: + raise StopIteration + qid, desc = _parse_hit_or_query_line(line.decode()) + + return qid + + def get_raw(self, offset): + """Return the raw string of a QueryResult object from the given offset.""" + handle = self._handle + handle.seek(offset) + qresult_key = None + qresult_raw = b"" + + while True: + line = handle.readline() + if not line: + break + elif line.startswith(self._query_mark): + cur_pos = handle.tell() + if qresult_key is None: + qresult_key = self.get_qresult_id(cur_pos) + else: + curr_key = self.get_qresult_id(cur_pos) + if curr_key != qresult_key: + break + handle.seek(cur_pos) + qresult_raw += line + + return qresult_raw + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/ExonerateIO/exonerate_vulgar.py b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_vulgar.py new file mode 100644 index 0000000..ce342d3 --- /dev/null +++ b/code/lib/Bio/SearchIO/ExonerateIO/exonerate_vulgar.py @@ -0,0 +1,219 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. 
+# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for Exonerate vulgar output format.""" + +import re + +from ._base import _BaseExonerateParser, _BaseExonerateIndexer, _STRAND_MAP + + +__all__ = ("ExonerateVulgarParser", "ExonerateVulgarIndexer") + + +# precompile regex +_RE_VULGAR = re.compile( + r"""^vulgar:\s+ + (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # query: ID, start, end, strand + (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # hit: ID, start, end, strand + (\d+)(\s+.*)$ # score, vulgar components + """, + re.VERBOSE, +) + +_RE_VCOMP = re.compile( + r""" + \s+(\S+) # vulgar label (C/M: codon/match, G: gap, N: ner, 5/3: splice + # site, I: intron, S: split codon, F: frameshift) + \s+(\d+) # how many residues to advance in query sequence + \s+(\d+) # how many residues to advance in hit sequence + """, + re.VERBOSE, +) + + +def parse_vulgar_comp(hsp, vulgar_comp): + """Parse the vulgar components present in the hsp dictionary.""" + # containers for block coordinates + qstarts = [hsp["query_start"]] + qends = [] + hstarts = [hsp["hit_start"]] + hends = [] + # containers for split codons + hsp["query_split_codons"] = [] + hsp["hit_split_codons"] = [] + # containers for ner blocks + hsp["query_ner_ranges"] = [] + hsp["hit_ner_ranges"] = [] + # sentinels for tracking query and hit positions + qpos = hsp["query_start"] + hpos = hsp["hit_start"] + # multiplier for determining sentinel movement + qmove = 1 if hsp["query_strand"] >= 0 else -1 + hmove = 1 if hsp["hit_strand"] >= 0 else -1 + + vcomps = re.findall(_RE_VCOMP, vulgar_comp) + for idx, match in enumerate(vcomps): + label, qstep, hstep = match[0], int(match[1]), int(match[2]) + # check for label, must be recognized + assert label in "MCGF53INS", "Unexpected vulgar label: %r" % label + # match, codon, or gaps + if label in "MCGS": + # if the previous comp is not an MCGS block, it's the + # start of a new block + if vcomps[idx - 1][0] not in "MCGS": + qstarts.append(qpos) + hstarts.append(hpos) + # other labels + # store the values in the hsp dict as a tuple of (start, stop) + # we're not doing anything if the label is in '53IN', as these + # basically tell us what the inter-block coordinates are and + # inter-block coordinates are automatically calculated by + # and HSP property + if label == "S": + # get start and stop from parsed values + qstart, hstart = qpos, hpos + qend = qstart + qstep * qmove + hend = hstart + hstep * hmove + # adjust the start-stop ranges + sqstart, sqend = min(qstart, qend), max(qstart, qend) + shstart, shend = min(hstart, hend), max(hstart, hend) + # split codons + # XXX: is it possible to have a frameshift that introduces + # a codon split? If so, this may need a different treatment.. 
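+            # each 'S' component contributes one piece of a codon that an
+            # intron has interrupted; store its span for both query and hit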
+ hsp["query_split_codons"].append((sqstart, sqend)) + hsp["hit_split_codons"].append((shstart, shend)) + + # move sentinels accordingly + qpos += qstep * qmove + hpos += hstep * hmove + + # append to ends if the next comp is not an MCGS block or + # if it's the last comp + if idx == len(vcomps) - 1 or ( + label in "MCGS" and vcomps[idx + 1][0] not in "MCGS" + ): + qends.append(qpos) + hends.append(hpos) + + # adjust coordinates + for seq_type in ("query_", "hit_"): + strand = hsp[seq_type + "strand"] + # switch coordinates if strand is < 0 + if strand < 0: + # switch the starts and ends + hsp[seq_type + "start"], hsp[seq_type + "end"] = ( + hsp[seq_type + "end"], + hsp[seq_type + "start"], + ) + if seq_type == "query_": + qstarts, qends = qends, qstarts + else: + hstarts, hends = hends, hstarts + + # set start and end ranges + hsp["query_ranges"] = list(zip(qstarts, qends)) + hsp["hit_ranges"] = list(zip(hstarts, hends)) + return hsp + + +class ExonerateVulgarParser(_BaseExonerateParser): + """Parser for Exonerate vulgar strings.""" + + _ALN_MARK = "vulgar" + + def parse_alignment_block(self, header): + """Parse alignment block for vulgar format, return query results, hits, hsps.""" + qresult = header["qresult"] + hit = header["hit"] + hsp = header["hsp"] + self.read_until(lambda line: line.startswith("vulgar")) + vulgars = re.search(_RE_VULGAR, self.line) + # if the file has c4 alignments + # check if vulgar values match our previously parsed header values + if self.has_c4_alignment: + assert qresult["id"] == vulgars.group(1) + assert hsp["query_start"] == vulgars.group(2) + assert hsp["query_end"] == vulgars.group(3) + assert hsp["query_strand"] == vulgars.group(4) + assert hit["id"] == vulgars.group(5) + assert hsp["hit_start"] == vulgars.group(6) + assert hsp["hit_end"] == vulgars.group(7) + assert hsp["hit_strand"] == vulgars.group(8) + assert hsp["score"] == vulgars.group(9) + else: + qresult["id"] = vulgars.group(1) + hsp["query_start"] = vulgars.group(2) + hsp["query_end"] = vulgars.group(3) + hsp["query_strand"] = vulgars.group(4) + hit["id"] = vulgars.group(5) + hsp["hit_start"] = vulgars.group(6) + hsp["hit_end"] = vulgars.group(7) + hsp["hit_strand"] = vulgars.group(8) + hsp["score"] = vulgars.group(9) + + # adjust strands + hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]] + hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]] + # cast coords into ints + hsp["query_start"] = int(hsp["query_start"]) + hsp["query_end"] = int(hsp["query_end"]) + hsp["hit_start"] = int(hsp["hit_start"]) + hsp["hit_end"] = int(hsp["hit_end"]) + # cast score into int + hsp["score"] = int(hsp["score"]) + # store vulgar line and parse it + # rstrip to remove line endings (otherwise gives errors in Windows) + hsp["vulgar_comp"] = vulgars.group(10).rstrip() + hsp = parse_vulgar_comp(hsp, hsp["vulgar_comp"]) + + return {"qresult": qresult, "hit": hit, "hsp": hsp} + + +class ExonerateVulgarIndexer(_BaseExonerateIndexer): + """Indexer class for exonerate vulgar lines.""" + + _parser = ExonerateVulgarParser + _query_mark = b"vulgar" + + def get_qresult_id(self, pos): + """Return the query ID of the nearest vulgar line.""" + handle = self._handle + handle.seek(pos) + # get line, check if it's a vulgar line, and get query ID + line = handle.readline() + assert line.startswith(self._query_mark), line + id = re.search(_RE_VULGAR, line.decode()) + return id.group(1) + + def get_raw(self, offset): + """Return the raw bytes string of a QueryResult object from the given offset.""" + handle = self._handle + 
handle.seek(offset)
+        qresult_key = None
+        qresult_raw = b""
+
+        while True:
+            line = handle.readline()
+            if not line:
+                break
+            elif line.startswith(self._query_mark):
+                cur_pos = handle.tell() - len(line)
+                if qresult_key is None:
+                    qresult_key = self.get_qresult_id(cur_pos)
+                else:
+                    curr_key = self.get_qresult_id(cur_pos)
+                    if curr_key != qresult_key:
+                        break
+            qresult_raw += line
+
+        return qresult_raw
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/FastaIO.py b/code/lib/Bio/SearchIO/FastaIO.py
new file mode 100644
index 0000000..8f6c227
--- /dev/null
+++ b/code/lib/Bio/SearchIO/FastaIO.py
@@ -0,0 +1,601 @@
+# Adapted from Bio.AlignIO.FastaIO copyright 2008-2011 by Peter Cock.
+# Copyright 2012 by Wibowo Arindrarto.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+r"""Bio.SearchIO support for Bill Pearson's FASTA tools.
+
+This module adds support for parsing FASTA outputs. FASTA is a suite of
+programs that finds regions of local or global similarity between protein
+or nucleotide sequences, either by searching databases or identifying
+local duplications.
+
+Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions:
+
+ - flavors: fasta, ssearch, tfastx
+ - versions: 35, 36
+
+Other flavors and/or versions may introduce some bugs. Please file a bug
+report on Biopython's bug tracker if you encounter such problems.
+
+More information on FASTA is available through these links:
+
+ - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml
+ - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf
+
+
+Supported Formats
+=================
+
+Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by
+the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST
+tabular format triggered by the -m 8 flag) may be parseable, but only with
+SearchIO's other parsers (in this case, the 'blast-tab' parser).
+
+
+fasta-m10
+=========
+
+Note that in FASTA -m 10 outputs, HSPs from different strands are considered
+to be from different hits. They are listed as two separate entries in the hit
+table. FastaIO recognizes this and will group HSPs with the same hit ID into
+a single Hit object, regardless of strand.
+
+FASTA also sometimes outputs extra sequences adjacent to the HSP match. These
+extra sequences are discarded by FastaIO. Only regions containing the actual
+sequence match are extracted.
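+
+For example, a minimal parsing sketch (``fasta_output.m10`` is only a
+placeholder name for any output file produced with the -m 10 flag)::
+
+    from Bio import SearchIO
+
+    for qresult in SearchIO.parse("fasta_output.m10", "fasta-m10"):
+        for hit in qresult:
+            print(qresult.id, hit.id, hit.hsps[0].evalue)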
+ +The following object attributes are provided: + ++-----------------+-------------------------+----------------------------------+ +| Object | Attribute | Value | ++=================+=========================+==================================+ +| QueryResult | description | query sequence description | +| +-------------------------+----------------------------------+ +| | id | query sequence ID | +| +-------------------------+----------------------------------+ +| | program | FASTA flavor | +| +-------------------------+----------------------------------+ +| | seq_len | full length of query sequence | +| +-------------------------+----------------------------------+ +| | target | target search database | +| +-------------------------+----------------------------------+ +| | version | FASTA version | ++-----------------+-------------------------+----------------------------------+ +| Hit | seq_len | full length of the hit sequence | ++-----------------+-------------------------+----------------------------------+ +| HSP | bitscore | \*_bits line | +| +-------------------------+----------------------------------+ +| | evalue | \*_expect line | +| +-------------------------+----------------------------------+ +| | ident_pct | \*_ident line | +| +-------------------------+----------------------------------+ +| | init1_score | \*_init1 line | +| +-------------------------+----------------------------------+ +| | initn_score | \*_initn line | +| +-------------------------+----------------------------------+ +| | opt_score | \*_opt line, \*_s-w opt line | +| +-------------------------+----------------------------------+ +| | pos_pct | \*_sim line | +| +-------------------------+----------------------------------+ +| | sw_score | \*_score line | +| +-------------------------+----------------------------------+ +| | z_score | \*_z-score line | ++-----------------+-------------------------+----------------------------------+ +| HSPFragment | aln_annotation | al_cons block, if present | +| (also via HSP) +-------------------------+----------------------------------+ +| | hit | hit sequence | +| +-------------------------+----------------------------------+ +| | hit_end | hit sequence end coordinate | +| +-------------------------+----------------------------------+ +| | hit_start | hit sequence start coordinate | +| +-------------------------+----------------------------------+ +| | hit_strand | hit sequence strand | +| +-------------------------+----------------------------------+ +| | query | query sequence | +| +-------------------------+----------------------------------+ +| | query_end | query sequence end coordinate | +| +-------------------------+----------------------------------+ +| | query_start | query sequence start coordinate | +| +-------------------------+----------------------------------+ +| | query_strand | query sequence strand | ++-----------------+-------------------------+----------------------------------+ + +""" + +import re + +from Bio.SearchIO._index import SearchIndexer +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + + +__all__ = ("FastaM10Parser", "FastaM10Indexer") + + +# precompile regex patterns +# regex for program name +_RE_FLAVS = re.compile(r"t?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search") +# regex for sequence ID and length ~ deals with both \n and \r\n +_PTR_ID_DESC_SEQLEN = r">>>(.+?)\s+(.*?) 
*- (\d+) (?:aa|nt)\s*$" +_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN) +_RE_ID_DESC_SEQLEN_IDX = re.compile(_PTR_ID_DESC_SEQLEN.encode()) +# regex for qresult, hit, or hsp attribute value +_RE_ATTR = re.compile(r"^; [a-z]+(_[ \w-]+):\s+(.*)$") +# regex for capturing excess start and end sequences in alignments +_RE_START_EXC = re.compile(r"^-*") +_RE_END_EXC = re.compile(r"-*$") + +# attribute name mappings +_HSP_ATTR_MAP = { + "_initn": ("initn_score", int), + "_init1": ("init1_score", int), + "_opt": ("opt_score", int), + "_s-w opt": ("opt_score", int), + "_z-score": ("z_score", float), + "_bits": ("bitscore", float), + "_expect": ("evalue", float), + "_score": ("sw_score", int), + "_ident": ("ident_pct", float), + "_sim": ("pos_pct", float), +} + +# state flags +_STATE_NONE = 0 +_STATE_QUERY_BLOCK = 1 +_STATE_HIT_BLOCK = 2 +_STATE_CONS_BLOCK = 3 + + +def _set_qresult_hits(qresult, hit_rows=()): + """Append Hits without alignments into QueryResults (PRIVATE).""" + for hit_row in hit_rows: + hit_id, remainder = hit_row.split(" ", 1) + # TODO: parse hit and hsp properties properly; by dealing with: + # - any character in the description (brackets, spaces, etc.) + # - possible [f] or [r] presence (for frame info) + # - possible presence of E2() column + # - possible incomplete hit_id due to column length limit + # The current method only looks at the Hit ID, none of the things above + if hit_id not in qresult: + frag = HSPFragment(hit_id, qresult.id) + hsp = HSP([frag]) + hit = Hit([hsp]) + qresult.append(hit) + + return qresult + + +def _set_hsp_seqs(hsp, parsed, program): + """Set HSPs sequences (PRIVATE). + + :param hsp: HSP whose properties will be set + :type hsp: HSP + :param parsed: parsed values of the HSP attributes + :type parsed: dictionary {string: object} + :param program: program name + :type program: string + + """ + # get aligned sequences and check if they have equal lengths + start = 0 + for seq_type in ("hit", "query"): + if "tfast" not in program: + pseq = parsed[seq_type] + # adjust start and end coordinates based on the amount of + # filler characters + start, stop = _get_aln_slice_coords(pseq) + start_adj = len(re.search(_RE_START_EXC, pseq["seq"]).group(0)) + stop_adj = len(re.search(_RE_END_EXC, pseq["seq"]).group(0)) + start = start + start_adj + stop = stop + start_adj - stop_adj + parsed[seq_type]["seq"] = pseq["seq"][start:stop] + if len(parsed["query"]["seq"]) != len(parsed["hit"]["seq"]): + raise ValueError( + "Length mismatch: %r %r" + % (len(parsed["query"]["seq"]), len(parsed["hit"]["seq"])) + ) + if "similarity" in hsp.aln_annotation: + # only using 'start' since FASTA seems to have trimmed the 'excess' + # end part + hsp.aln_annotation["similarity"] = hsp.aln_annotation["similarity"][start:] + # hit or query works equally well here + assert len(hsp.aln_annotation["similarity"]) == len(parsed["hit"]["seq"]) + + # query and hit sequence types must be the same + assert parsed["query"]["_type"] == parsed["hit"]["_type"] + type_val = parsed["query"]["_type"] # hit works fine too + molecule_type = "DNA" if type_val == "D" else "protein" + setattr(hsp.fragment, "molecule_type", molecule_type) + + for seq_type in ("hit", "query"): + # get and set start and end coordinates + start = int(parsed[seq_type]["_start"]) + end = int(parsed[seq_type]["_stop"]) + + setattr(hsp.fragment, seq_type + "_start", min(start, end) - 1) + setattr(hsp.fragment, seq_type + "_end", max(start, end)) + # set seq and molecule type + setattr(hsp.fragment, seq_type, 
parsed[seq_type]["seq"])
+
+        if molecule_type != "protein":
+            # get strand from coordinate; start <= end is plus
+            # start > end is minus
+            if start <= end:
+                setattr(hsp.fragment, seq_type + "_strand", 1)
+            else:
+                setattr(hsp.fragment, seq_type + "_strand", -1)
+        else:
+            setattr(hsp.fragment, seq_type + "_strand", 0)
+
+
+def _get_aln_slice_coords(parsed_hsp):
+    """Get the slice coordinates of the aligned HSP sequence (PRIVATE).
+
+    To get the actual pairwise alignment sequences, we must first
+    translate the un-gapped sequence based coordinates into positions
+    in the gapped sequence (which may have a flanking region shown
+    using leading - characters). To date, I have never seen any
+    trailing flanking region shown in the m10 file, but the
+    following code should also cope with that.
+
+    Note that this code seems to work fine even when the "sq_offset"
+    entries are present as a result of using the -X command line option.
+    """
+    seq = parsed_hsp["seq"]
+    seq_stripped = seq.strip("-")
+    disp_start = int(parsed_hsp["_display_start"])
+    start = int(parsed_hsp["_start"])
+    stop = int(parsed_hsp["_stop"])
+
+    if start <= stop:
+        start = start - disp_start
+        stop = stop - disp_start + 1
+    else:
+        start = disp_start - start
+        stop = disp_start - stop + 1
+    stop += seq_stripped.count("-")
+    if not (0 <= start and start < stop and stop <= len(seq_stripped)):
+        raise ValueError(
+            "Problem with sequence start/stop,\n%s[%i:%i]\n%s"
+            % (seq, start, stop, parsed_hsp)
+        )
+    return start, stop
+
+
+class FastaM10Parser:
+    """Parser for Bill Pearson's FASTA suite's -m 10 output."""
+
+    def __init__(self, handle, __parse_hit_table=False):
+        """Initialize the class."""
+        self.handle = handle
+        self._preamble = self._parse_preamble()
+
+    def __iter__(self):
+        """Iterate over FastaM10Parser object, yielding query results."""
+        for qresult in self._parse_qresult():
+            # re-set desc, for hsp query description
+            qresult.description = qresult.description
+            yield qresult
+
+    def _parse_preamble(self):
+        """Parse the Fasta preamble for Fasta flavor and version (PRIVATE)."""
+        preamble = {}
+        while True:
+            line = self.handle.readline()
+            # this should be the line just before the first qresult
+            if line.startswith("Query"):
+                break
+            # try to match for version line
+            elif line.startswith(" version"):
+                preamble["version"] = line.split(" ")[2]
+            else:
+                # try to match for flavor line
+                flav_match = re.match(_RE_FLAVS, line.lower())
+                if flav_match:
+                    preamble["program"] = flav_match.group(0)
+        self.line = line
+
+        return preamble
+
+    def __parse_hit_table(self):
+        """Parse hit table rows (PRIVATE)."""
+        # collect hit table rows until we see an empty line
+        hit_rows = []
+        while True:
+            line = self.handle.readline()
+            if not line or not line.strip():
+                break
+            hit_rows.append(line.strip())
+        self.line = line
+        return hit_rows
+
+    def _parse_qresult(self):
+        """Parse query result (PRIVATE)."""
+        # initial qresult value
+        qresult = None
+        hit_rows = []
+        # state values
+        state_QRES_NEW = 1
+        state_QRES_HITTAB = 3
+        state_QRES_CONTENT = 5
+        state_QRES_END = 7
+
+        line = self.line
+
+        while True:
+
+            # one line before the hit table
+            if line.startswith("The best scores are:"):
+                qres_state = state_QRES_HITTAB
+            # the end of a query or the file altogether
+            elif line.strip() == ">>>///" or not line:
+                qres_state = state_QRES_END
+            # the beginning of a new query
+            elif not line.startswith(">>>") and ">>>" in line:
+                qres_state = state_QRES_NEW
+            # the beginning of the query info and its hits + hsps
+            elif line.startswith(">>>") and not line.strip() == ">>><<<":
+                qres_state = 
state_QRES_CONTENT + # default qres mark + else: + qres_state = None + + if qres_state is not None: + if qres_state == state_QRES_HITTAB: + # parse hit table if flag is set + hit_rows = self.__parse_hit_table() + line = self.handle.readline() + + elif qres_state == state_QRES_END: + yield _set_qresult_hits(qresult, hit_rows) + break + + elif qres_state == state_QRES_NEW: + # if qresult is filled, yield it first + if qresult is not None: + yield _set_qresult_hits(qresult, hit_rows) + regx = re.search(_RE_ID_DESC_SEQLEN, line) + query_id = regx.group(1) + seq_len = regx.group(3) + desc = regx.group(2) + qresult = QueryResult(id=query_id) + qresult.seq_len = int(seq_len) + # get target from the next line + line = self.handle.readline() + qresult.target = [x for x in line.split(" ") if x][1].strip() + if desc is not None: + qresult.description = desc + # set values from preamble + for key, value in self._preamble.items(): + setattr(qresult, key, value) + line = self.handle.readline() + + elif qres_state == state_QRES_CONTENT: + assert line[3:].startswith(qresult.id), line + for hit, strand in self._parse_hit(query_id): + # HACK: re-set desc, for hsp hit and query description + hit.description = hit.description + hit.query_description = qresult.description + # if hit is not in qresult, append it + if hit.id not in qresult: + qresult.append(hit) + # otherwise, it might be the same hit with a different strand + else: + # make sure strand is different and then append hsp to + # existing hit + for hsp in hit.hsps: + assert strand != hsp.query_strand + qresult[hit.id].append(hsp) + line = self.line + + else: + line = self.handle.readline() + + self.line = line + + def _parse_hit(self, query_id): + """Parse hit on query identifier (PRIVATE).""" + while True: + line = self.handle.readline() + if line.startswith(">>"): + break + + state = _STATE_NONE + strand = None + hsp_list = [] + hsp = None + parsed_hsp = None + hit_desc = None + seq_len = None + while True: + # yield hit if we've reached the start of a new query or + # the end of the search + self.line = self.handle.readline() + if self.line.strip() in [">>><<<", ">>>///"] or ( + not self.line.startswith(">>>") and ">>>" in self.line + ): + # append last parsed_hsp['hit']['seq'] line + if state == _STATE_HIT_BLOCK: + parsed_hsp["hit"]["seq"] += line.strip() + elif state == _STATE_CONS_BLOCK: + hsp.aln_annotation["similarity"] += line.strip("\r\n") + # process HSP alignment and coordinates + _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"]) + hit = Hit(hsp_list) + hit.description = hit_desc + hit.seq_len = seq_len + yield hit, strand + hsp_list = [] + break + # yield hit and create a new one if we're still in the same query + elif line.startswith(">>"): + # try yielding, if we have hsps + if hsp_list: + _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"]) + hit = Hit(hsp_list) + hit.description = hit_desc + hit.seq_len = seq_len + yield hit, strand + hsp_list = [] + # try to get the hit id and desc, and handle cases without descs + try: + hit_id, hit_desc = line[2:].strip().split(" ", 1) + except ValueError: + hit_id = line[2:].strip().split(" ", 1)[0] + hit_desc = "" + # create the HSP object for Hit + frag = HSPFragment(hit_id, query_id) + hsp = HSP([frag]) + hsp_list.append(hsp) + # set or reset the state to none + state = _STATE_NONE + parsed_hsp = {"query": {}, "hit": {}} + # create and append a new HSP if line starts with '>--' + elif line.startswith(">--"): + # set seq attributes of previous hsp + _set_hsp_seqs(hsp, parsed_hsp, 
self._preamble["program"]) + # and create a new one + frag = HSPFragment(hit_id, query_id) + hsp = HSP([frag]) + hsp_list.append(hsp) + # set the state ~ none yet + state = _STATE_NONE + parsed_hsp = {"query": {}, "hit": {}} + # this is either query or hit data in the HSP, depending on the state + elif line.startswith(">"): + if state == _STATE_NONE: + # make sure it's the correct query + if not query_id.startswith(line[1:].split(" ")[0]): + raise ValueError("%r vs %r" % (query_id, line)) + state = _STATE_QUERY_BLOCK + parsed_hsp["query"]["seq"] = "" + elif state == _STATE_QUERY_BLOCK: + # make sure it's the correct hit + assert hit_id.startswith(line[1:].split(" ")[0]) + state = _STATE_HIT_BLOCK + parsed_hsp["hit"]["seq"] = "" + # check for conservation block + elif line.startswith("; al_cons"): + state = _STATE_CONS_BLOCK + hsp.fragment.aln_annotation["similarity"] = "" + elif line.startswith(";"): + # Fasta outputs do not make a clear distinction between Hit + # and HSPs, so we check the attribute names to determine + # whether it belongs to a Hit or HSP + regx = re.search(_RE_ATTR, line.strip()) + name = regx.group(1) + value = regx.group(2) + + # for values before the '>...' query block + if state == _STATE_NONE: + if name in _HSP_ATTR_MAP: + attr_name, caster = _HSP_ATTR_MAP[name] + if caster is not str: + value = caster(value) + if name in ["_ident", "_sim"]: + value *= 100 + setattr(hsp, attr_name, value) + # otherwise, pool the values for processing later + elif state == _STATE_QUERY_BLOCK: + parsed_hsp["query"][name] = value + elif state == _STATE_HIT_BLOCK: + if name == "_len": + seq_len = int(value) + else: + parsed_hsp["hit"][name] = value + # for values in the hit block + else: + raise ValueError("Unexpected line: %r" % line) + # otherwise, it must be lines containing the sequences + else: + assert ">" not in line + # if we're in hit, parse into hsp.hit + if state == _STATE_HIT_BLOCK: + parsed_hsp["hit"]["seq"] += line.strip() + elif state == _STATE_QUERY_BLOCK: + parsed_hsp["query"]["seq"] += line.strip() + elif state == _STATE_CONS_BLOCK: + hsp.fragment.aln_annotation["similarity"] += line.strip("\r\n") + # we should not get here! 
+ else: + raise ValueError("Unexpected line: %r" % line) + line = self.line + + +class FastaM10Indexer(SearchIndexer): + """Indexer class for Bill Pearson's FASTA suite's -m 10 output.""" + + _parser = FastaM10Parser + + def __init__(self, filename): + """Initialize the class.""" + SearchIndexer.__init__(self, filename) + + def __iter__(self): + """Iterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths.""" + handle = self._handle + handle.seek(0) + start_offset = handle.tell() + qresult_key = None + query_mark = b">>>" + + line = handle.readline() + while True: + end_offset = handle.tell() + + if not line.startswith(query_mark) and query_mark in line: + regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line) + qresult_key = regx.group(1).decode() + start_offset = end_offset - len(line) + # yield whenever we encounter a new query or at the end of the file + if qresult_key is not None: + if not line: + yield qresult_key, start_offset, end_offset - start_offset + break + line = handle.readline() + if not line.startswith(query_mark) and query_mark in line: + yield qresult_key, start_offset, end_offset - start_offset + start_offset = end_offset + else: + line = handle.readline() + + def get_raw(self, offset): + """Return the raw record from the file as a bytes string.""" + handle = self._handle + qresult_raw = b"" + query_mark = b">>>" + + # read header first + handle.seek(0) + line = handle.readline() + while True: + qresult_raw += line + line = handle.readline() + if not line.startswith(query_mark) and query_mark in line: + break + + # and read the qresult raw string + handle.seek(offset) + line = handle.readline() + while True: + # preserve whitespace, don't use read_forward + if not line: + break + qresult_raw += line + + line = handle.readline() + # break when we've reached qresult end + if not line.startswith(query_mark) and query_mark in line: + break + + # append mock end marker to qresult_raw, since it's not always present + return qresult_raw + b">>><<<\n" + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/HHsuiteIO/__init__.py b/code/lib/Bio/SearchIO/HHsuiteIO/__init__.py new file mode 100644 index 0000000..faf2ce3 --- /dev/null +++ b/code/lib/Bio/SearchIO/HHsuiteIO/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2019 by Jens Thomas. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO support for HHSUITE output formats. + +This module adds support for parsing HHSUITE version 2 output. 
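+
+A minimal usage sketch (``result.hhr`` is only a placeholder name for an
+HHsuite plain text output file; ``hhsuite2-text`` is the format name this
+module's parser is registered under in Bio.SearchIO)::
+
+    from Bio import SearchIO
+
+    qresult = SearchIO.read("result.hhr", "hhsuite2-text")
+    for hit in qresult:
+        print(hit.id, hit.evalue)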
+
+More information about HHSUITE is available through these links:
+- Github repository: https://github.com/soedinglab/hh-suite
+- Wiki: https://github.com/soedinglab/hh-suite/wiki
+
+"""
+
+from .hhsuite2_text import Hhsuite2TextParser
diff --git a/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..77e9d8c
Binary files /dev/null and b/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/hhsuite2_text.cpython-37.pyc b/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/hhsuite2_text.cpython-37.pyc
new file mode 100644
index 0000000..8effd2d
Binary files /dev/null and b/code/lib/Bio/SearchIO/HHsuiteIO/__pycache__/hhsuite2_text.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/HHsuiteIO/hhsuite2_text.py b/code/lib/Bio/SearchIO/HHsuiteIO/hhsuite2_text.py
new file mode 100644
index 0000000..2335620
--- /dev/null
+++ b/code/lib/Bio/SearchIO/HHsuiteIO/hhsuite2_text.py
@@ -0,0 +1,234 @@
+# Copyright 2019 by Jens Thomas. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO parser for HHSUITE version 2 and 3 plain text output format."""
+
+import re
+from collections import OrderedDict
+import warnings
+
+from Bio.SearchIO._utils import read_forward
+from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
+
+__all__ = ("Hhsuite2TextParser",)
+
+# precompile regex patterns for faster processing
+# regex for query name capture
+_RE_QUERY = re.compile(r"^Query\s+(.+)\s?$")
+
+# regex for the line starting a hit block, e.g. 'No 1'
+_RE_HIT_BLOCK_START = re.compile(r"^No +(\d+)\s+$")
+
+# id and full description
+_RE_HIT_BLOCK_DESC = re.compile(r">(\S+)\s+(.*)$")
+
+# sequence alignment line
+# Q sp|Q9BSU1|CP07 229 DAKMRVFERSVYFGDSCQDVLSMLGSPHKV 258 (422)
+_RE_MATCH_BLOCK_QUERY_SEQ = re.compile(r"^Q\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$")
+_RE_MATCH_BLOCK_HIT_SEQ = re.compile(r"^T\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$")
+
+_END_OF_FILE_MARKER = "Done!"
+
+_PROGRAM = "HHSUITE"
+
+# Maximum number of lines to read before expecting a hit block.
+# This determines the maximum number of hits that would be allowed in
+# the initial hit table.
+MAX_READ_UNTIL = 5000
+
+
+class Hhsuite2TextParser:
+    """Parser for the HHSUITE version 2 and 3 text output."""
+
+    def __init__(self, handle):
+        """Initialize the class."""
+        self.handle = handle
+        self.line = read_forward(self.handle)
+        self.done = False
+        self.query_id = None
+        self.seq_len = None
+
+    def __iter__(self):
+        """Iterate over query results - there will only ever be one."""
+        yield from self._parse_qresult()
+
+    def _read_until(self, bool_func, stop_on_blank=True, max_read_until=MAX_READ_UNTIL):
+        """Read the file handle until the given function returns True (PRIVATE)."""
+        count = 0
+        while True:
+            if stop_on_blank and not self.line:
+                return
+            if bool_func(self.line):
+                return
+            else:
+                self.line = read_forward(self.handle)
+            count += 1
+            if count >= max_read_until:
+                raise RuntimeError("Exceeded max_read_until in _read_until")
+
+    def _parse_qresult(self):
+        """Parse HHSUITE output file (PRIVATE)."""
+        hit_block_data = []
+        self._parse_preamble()
+        self._read_until(
+            lambda line: re.search(_RE_HIT_BLOCK_START, line), stop_on_blank=False
+        )
+        while not self.done:
+            hit_dict = self._parse_hit_block()
+            hit_block_data.append(hit_dict)
+        return self._create_qresult(hit_block_data)
+
+    def _parse_preamble(self):
+        """Parse metadata about query (PRIVATE)."""
+        meta = {}
+        while self.line:
+            regx = re.search(_RE_QUERY, self.line)
+            if regx:
+                self.query_id = regx.group(1)
+            if self.line.startswith("Match_columns"):
+                self.seq_len = int(self.line.strip().split()[1])
+            self.line = self.handle.readline().strip()
+        return meta
+
+    def _parse_hit_block(self):
+        """Parse a hit block (PRIVATE)."""
+        self.line = read_forward(self.handle)
+        match = re.search(_RE_HIT_BLOCK_DESC, self.line)
+        if not match:
+            raise RuntimeError(
+                f"Unexpected content in HIT_BLOCK_DESC line: '{self.line}'"
+            )
+        hit_data = {
+            "hit_id": match.group(1),
+            "description": match.group(2).lstrip(" ;"),
+            "evalue": None,
+            "hit_start": None,
+            "hit_end": None,
+            "hit_seq": "",
+            "prob": None,
+            "query_start": None,
+            "query_end": None,
+            "query_seq": "",
+            "score": None,
+        }
+        self.line = self.handle.readline()
+        self._process_score_line(self.line, hit_data)
+        while True:
+            self.line = read_forward(self.handle)
+            if not self.line.strip() or self.line.startswith(_END_OF_FILE_MARKER):
+                # _END_OF_FILE_MARKER isn't always present
+                self.done = True
+                return hit_data
+            elif re.search(_RE_HIT_BLOCK_START, self.line):
+                return hit_data
+            else:
+                self._parse_hit_match_block(hit_data)
+
+    @staticmethod
+    def _process_score_line(line, hit_data):
+        """Parse the scores from the line and populate the hit_data dict (PRIVATE).
+
+        Lines are of the form:
+        Probab=99.95 E-value=3.7e-34 Score=210.31 Aligned_cols=171 Identities=100% Similarity=2.050 Sum_probs=166.9
+
+        E-value could be in decimal or scientific notation, so split the string
+        rather than use a regexp - this also means we should be tolerant of
+        additional fields being added/removed
+        """
+        score_map = {"E-value": "evalue", "Score": "score", "Probab": "prob"}
+        for score_pair in line.strip().split():
+            key, value = score_pair.split("=")
+            if key in score_map:
+                try:
+                    hit_data[score_map[key]] = float(value)
+                except ValueError:
+                    # We trigger warnings here as it's not a big enough problem
+                    # to crash, but it indicates something unexpected.
+                    warnings.warn(
+                        f"HHsuite parser: unable to extract {key} from line: {line}"
+                    )
+
+    def _parse_hit_match_block(self, hit_match_data):
+        """Parse a single block of hit sequence data (PRIVATE).
+
+        Parses a block such as ::
+
+            Q ss_pred             ceecchHHHHHHHHHHHHHHHHHHHhhhhhcCCCCccc
+            Q 4P79:A|PDBID|C  160 YELGPALYLGWSASLLSILGGICVFSTAAASSKEEPAT  197 (198)
+            Q Consensus       160 ~~~g~sf~l~~~~~~l~~~~~~l~~~~~~~~~~~~~~~  197 (198)
+                                  .++|||||++|++.++.+++++++++..+..++++..+
+            T Consensus       327 ~~~GwS~~l~~~s~~l~lia~~l~~~~~~~~~~~~~~~  364 (364)
+            T 5B2G_A          327 REMGASLYVGWAASGLLLLGGGLLCCSGPSSGENLYFQ  364 (364)
+            T ss_dssp          EEECTHHHHHHHHHHHHHHHHHHHHCC-----------
+            T ss_pred          cccchHHHHHHHHHHHHHHHHHHHHhcCCCCCCccccC
+
+        """
+
+        def match_is_valid(match):
+            """Return True if match is not a Consensus line (PRIVATE).
+
+            It's not possible to distinguish a sequence line from a Consensus
+            line with a regexp, so we need to check the ID column.
+            """
+            return match.group(1).strip() != "Consensus"
+
+        while True:
+            if not self.line.strip():  # blank lines indicate the end of a hit block
+                return
+            match = re.match(_RE_MATCH_BLOCK_QUERY_SEQ, self.line)
+            if match and match_is_valid(match):
+                hit_match_data["query_seq"] += match.group(3).strip()
+                if hit_match_data["query_start"] is None:
+                    hit_match_data["query_start"] = int(match.group(2))
+                hit_match_data["query_end"] = int(match.group(4))
+            else:
+                match = re.match(_RE_MATCH_BLOCK_HIT_SEQ, self.line)
+                if match and match_is_valid(match):
+                    hit_match_data["hit_seq"] += match.group(3).strip()
+                    if hit_match_data["hit_start"] is None:
+                        hit_match_data["hit_start"] = int(match.group(2))
+                    hit_match_data["hit_end"] = int(match.group(4))
+            self.line = self.handle.readline()
+
+    def _create_qresult(self, hit_blocks):
+        """Create the Biopython data structures from the parsed data (PRIVATE)."""
+        query_id = self.query_id
+        hit_dict = OrderedDict()
+
+        for output_index, block in enumerate(hit_blocks):
+            hit_id = block["hit_id"]
+
+            frag = HSPFragment(hit_id, query_id)
+            frag.molecule_type = "protein"
+            frag.query_start = block["query_start"] - 1
+            frag.query_end = block["query_end"]
+            frag.hit_start = block["hit_start"] - 1
+            frag.hit_end = block["hit_end"]
+            frag.hit = block["hit_seq"]
+            frag.query = block["query_seq"]
+
+            hsp = HSP([frag])
+            hsp.hit_id = hit_id
+            hsp.output_index = output_index
+            hsp.query_id = query_id
+            hsp.hit_description = block["description"]
+            is_included = True  # Should everything be included?
+            hsp.is_included = is_included
+            hsp.evalue = block["evalue"]
+            hsp.score = block["score"]
+            hsp.prob = block["prob"]
+
+            if hit_id not in hit_dict:
+                hit = Hit([hsp], hit_id)
+                hit.description = block["description"]
+                hit.is_included = is_included
+                hit.evalue = block["evalue"]
+                hit.score = block["score"]
+                hit_dict[hit_id] = hit
+            else:
+                hit_dict[hit_id].append(hsp)
+
+        qresult = QueryResult(hit_dict.values(), query_id)
+        qresult.program = _PROGRAM
+        qresult.seq_len = self.seq_len
+        return [qresult]
diff --git a/code/lib/Bio/SearchIO/HmmerIO/__init__.py b/code/lib/Bio/SearchIO/HmmerIO/__init__.py
new file mode 100644
index 0000000..c243007
--- /dev/null
+++ b/code/lib/Bio/SearchIO/HmmerIO/__init__.py
@@ -0,0 +1,304 @@
+# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO support for HMMER output formats.
+
+This module adds support for parsing HMMER outputs. HMMER is a suite of
+programs implementing profile hidden Markov models to find similarity
+across protein sequences.
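+
+For instance, a minimal parsing sketch (``hmmer_output.txt`` is only a
+placeholder name for a plain text output file; the supported format names
+are listed below)::
+
+    from Bio import SearchIO
+
+    for qresult in SearchIO.parse("hmmer_output.txt", "hmmer3-text"):
+        for hit in qresult:
+            print(qresult.id, hit.id, hit.evalue)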
+
+Bio.SearchIO.HmmerIO was tested on the following HMMER versions and flavors:
+
+ - HMMER3 flavors: hmmscan, hmmsearch, phmmer
+ - HMMER2 flavors: hmmpfam, hmmsearch
+
+More information on HMMER is available through these links:
+ - Web page: http://hmmer.janelia.org/
+ - User guide: ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf
+
+
+Supported formats
+=================
+
+Bio.SearchIO.HmmerIO supports the following HMMER output formats:
+
+ - Plain text, v3.0 - 'hmmer3-text' - parsing, indexing
+ - Table, v3.0 - 'hmmer3-tab' - parsing, indexing, writing
+ - Domain table, v3.0 - 'hmmer3-domtab'* - parsing, indexing, writing
+ - Plain text, v2.x - 'hmmer2-text' - parsing, indexing
+
+* For the domain table output, due to the way HMMER outputs the sequence
+  coordinates, you have to specify which HMMER flavor produced the output as
+  the file format. So instead of using 'hmmer3-domtab', you have to use
+  either 'hmmscan3-domtab', 'hmmsearch3-domtab', or 'phmmer3-domtab' as the
+  file format name.
+
+Note that for all output formats, HMMER uses its own convention for input and
+output coordinates. It does not use the terms 'hit' or 'query'; instead it
+uses 'hmm' or 'ali'. For example, 'hmmfrom' is the start coordinate of the HMM
+sequence, while 'alifrom' is the start coordinate of the protein sequence.
+
+HmmerIO is aware of this different naming scheme and will adjust these names
+accordingly to fit SearchIO's object model. If HmmerIO sees that the output
+file to parse was written by hmmsearch or phmmer, all 'hmm' coordinates will
+be the hit coordinates and 'ali' coordinates will be the query coordinates.
+Conversely, if the HMMER flavor is hmmscan, 'hmm' will be query and 'ali'
+will be hit.
+
+This is why the 'hmmer3-domtab' format has to be specified with the source
+HMMER flavor: the parsers need to know which is the hit and which is the
+query. 'hmmer3-text' has its source program information present in the file,
+while 'hmmer3-tab' does not output any coordinates. That's why neither of
+these formats needs a direct flavor specification like 'hmmer3-domtab'.
+
+Also note that the domain table format writers use HMMER's naming convention
+('hmm' and 'ali'), so the files you write will be similar to files written
+by a real HMMER program.
+
+
+hmmer2-text and hmmer3-text
+===========================
+
+The parser for HMMER 3.0 plain text output can parse output files with
+alignment blocks (default) or without (with the '--noali' flag). If the
+alignment blocks are present, you can also parse files with variable
+alignment width (using the '--notextw' or '--textw' flag).
+
+The following SearchIO object attributes are provided. Rows marked with '*'
+denote attributes not available in the hmmer2-text format:
+
++-----------------+-------------------------+----------------------------------+
+| Object          | Attribute               | Value                            |
++=================+=========================+==================================+
+| QueryResult     | accession               | accession (if present)           |
+|                 +-------------------------+----------------------------------+
+|                 | description             | query sequence description       |
+|                 +-------------------------+----------------------------------+
+|                 | id                      | query sequence ID                |
+|                 +-------------------------+----------------------------------+
+|                 | program                 | HMMER flavor                     |
+|                 +-------------------------+----------------------------------+
+|                 | seq_len*                | full length of query sequence    |
+|                 +-------------------------+----------------------------------+
+|                 | target                  | target search database           |
+|                 +-------------------------+----------------------------------+
+|                 | version                 | HMMER version                    |
++-----------------+-------------------------+----------------------------------+
+| Hit             | bias*                   | hit-level bias                   |
+|                 +-------------------------+----------------------------------+
+|                 | bitscore                | hit-level score                  |
+|                 +-------------------------+----------------------------------+
+|                 | description             | hit sequence description         |
+|                 +-------------------------+----------------------------------+
+|                 | domain_exp_num*         | expected number of domains in    |
+|                 |                         | the hit (exp column)             |
+|                 +-------------------------+----------------------------------+
+|                 | domain_obs_num          | observed number of domains in    |
+|                 |                         | the hit (N column)               |
+|                 +-------------------------+----------------------------------+
+|                 | evalue                  | hit-level e-value                |
+|                 +-------------------------+----------------------------------+
+|                 | id                      | hit sequence ID                  |
+|                 +-------------------------+----------------------------------+
+|                 | is_included*            | boolean, whether the hit is in   |
+|                 |                         | the inclusion threshold or not   |
++-----------------+-------------------------+----------------------------------+
+| HSP             | acc_avg*                | expected accuracy per alignment  |
+|                 |                         | residue (acc column)             |
+|                 +-------------------------+----------------------------------+
+|                 | bias*                   | hsp-level bias                   |
+|                 +-------------------------+----------------------------------+
+|                 | bitscore                | hsp-level score                  |
+|                 +-------------------------+----------------------------------+
+|                 | domain_index            | the domain index set by HMMER    |
+|                 +-------------------------+----------------------------------+
+|                 | env_end*                | end coordinate of the envelope   |
+|                 +-------------------------+----------------------------------+
+|                 | env_endtype*            | envelope end types (e.g. '[]',   |
+|                 |                         | '..', '[.', etc.)               
| +| +-------------------------+----------------------------------+ +| | env_start* | start coordinate of the envelope | +| +-------------------------+----------------------------------+ +| | evalue | hsp-level independent e-value | +| +-------------------------+----------------------------------+ +| | evalue_cond* | hsp-level conditional e-value | +| +-------------------------+----------------------------------+ +| | hit_endtype | hit sequence end types | +| +-------------------------+----------------------------------+ +| | is_included* | boolean, whether the hit of the | +| | | hsp is in the inclusion | +| | | threshold | +| +-------------------------+----------------------------------+ +| | query_endtype | query sequence end types | ++-----------------+-------------------------+----------------------------------+ +| HSPFragment | aln_annotation | alignment similarity string and | +| (also via HSP) | | other annotations (e.g. PP, CS) | +| +-------------------------+----------------------------------+ +| | aln_span | length of alignment fragment | +| +-------------------------+----------------------------------+ +| | hit | hit sequence | +| +-------------------------+----------------------------------+ +| | hit_end | hit sequence end coordinate, may | +| | | be 'hmmto' or 'alito' depending | +| | | on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | hit_start | hit sequence start coordinate, | +| | | may be 'hmmfrom' or 'alifrom' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | hit_strand | hit sequence strand | +| +-------------------------+----------------------------------+ +| | query | query sequence | +| +-------------------------+----------------------------------+ +| | query_end | query sequence end coordinate, | +| | | may be 'hmmto' or 'alito' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | query_start | query sequence start coordinate, | +| | | may be 'hmmfrom' or 'alifrom' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | query_strand | query sequence strand | ++-----------------+-------------------------+----------------------------------+ + + +hmmer3-tab +========== +The following SearchIO objects attributes are provided: + ++-----------------+-------------------------+----------------------------------+ +| Object | Attribute | Column / Value | ++=================+=========================+==================================+ +| QueryResult | accession | query accession (if present) | +| +-------------------------+----------------------------------+ +| | description | query sequence description | +| +-------------------------+----------------------------------+ +| | id | query name | ++-----------------+-------------------------+----------------------------------+ +| Hit | accession | hit accession | +| +-------------------------+----------------------------------+ +| | bias | hit-level bias | +| +-------------------------+----------------------------------+ +| | bitscore | hit-level score | +| +-------------------------+----------------------------------+ +| | description | hit sequence description | +| +-------------------------+----------------------------------+ +| | cluster_num | clu column | +| +-------------------------+----------------------------------+ +| | domain_exp_num | exp column | +| 
+-------------------------+----------------------------------+ +| | domain_included_num | inc column | +| +-------------------------+----------------------------------+ +| | domain_obs_num | dom column | +| +-------------------------+----------------------------------+ +| | domain_reported_num | rep column | +| +-------------------------+----------------------------------+ +| | env_num | env column | +| +-------------------------+----------------------------------+ +| | evalue | hit-level evalue | +| +-------------------------+----------------------------------+ +| | id | target name | +| +-------------------------+----------------------------------+ +| | overlap_num | ov column | +| +-------------------------+----------------------------------+ +| | region_num | reg column | ++-----------------+-------------------------+----------------------------------+ +| HSP | bias | bias of the best domain | +| +-------------------------+----------------------------------+ +| | bitscore | bitscore of the best domain | +| +-------------------------+----------------------------------+ +| | evalue | evalue of the best domain | ++-----------------+-------------------------+----------------------------------+ + + +hmmer3-domtab +============= +To parse domain table files, you must use the HMMER flavor that produced the +file. So instead of using 'hmmer3-domtab', use either 'hmmsearch3-domtab', +'hmmscan3-domtab', or 'phmmer3-domtab'. + +The following SearchIO objects attributes are provided: + ++-----------------+-------------------------+----------------------------------+ +| Object | Attribute | Value | ++=================+=========================+==================================+ +| QueryResult | accession | accession | +| +-------------------------+----------------------------------+ +| | description | query sequence description | +| +-------------------------+----------------------------------+ +| | id | query sequence ID | +| +-------------------------+----------------------------------+ +| | seq_len | full length of query sequence | ++-----------------+-------------------------+----------------------------------+ +| Hit | accession | accession | +| +-------------------------+----------------------------------+ +| | bias | hit-level bias | +| +-------------------------+----------------------------------+ +| | bitscore | hit-level score | +| +-------------------------+----------------------------------+ +| | description | hit sequence description | +| +-------------------------+----------------------------------+ +| | evalue | hit-level e-value | +| +-------------------------+----------------------------------+ +| | id | hit sequence ID | +| +-------------------------+----------------------------------+ +| | seq_len | length of hit sequence or HMM | ++-----------------+-------------------------+----------------------------------+ +| HSP | acc_avg | expected accuracy per alignment | +| | | residue (acc column) | +| +-------------------------+----------------------------------+ +| | bias | hsp-level bias | +| +-------------------------+----------------------------------+ +| | bitscore | hsp-level score | +| +-------------------------+----------------------------------+ +| | domain_index | the domain index set by HMMER | +| +-------------------------+----------------------------------+ +| | env_end | end coordinate of the envelope | +| +-------------------------+----------------------------------+ +| | env_start | start coordinate of the envelope | +| 
+-------------------------+----------------------------------+ +| | evalue | hsp-level independent e-value | +| +-------------------------+----------------------------------+ +| | evalue_cond | hsp-level conditional e-value | ++-----------------+-------------------------+----------------------------------+ +| HSPFragment | hit_end | hit sequence end coordinate, may | +| (also via HSP) | | be 'hmmto' or 'alito' depending | +| | | on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | hit_start | hit sequence start coordinate, | +| | | may be 'hmmfrom' or 'alifrom' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | hit_strand | hit sequence strand | +| +-------------------------+----------------------------------+ +| | query_end | query sequence end coordinate, | +| | | may be 'hmmto' or 'alito' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | query_start | query sequence start coordinate, | +| | | may be 'hmmfrom' or 'alifrom' | +| | | depending on the HMMER flavor | +| +-------------------------+----------------------------------+ +| | query_strand | query sequence strand | ++-----------------+-------------------------+----------------------------------+ + +""" + +from .hmmer2_text import Hmmer2TextParser, Hmmer2TextIndexer +from .hmmer3_domtab import ( + Hmmer3DomtabParser, + Hmmer3DomtabHmmhitParser, + Hmmer3DomtabHmmqueryParser, +) +from .hmmer3_domtab import Hmmer3DomtabHmmhitIndexer, Hmmer3DomtabHmmqueryIndexer +from .hmmer3_domtab import Hmmer3DomtabHmmhitWriter, Hmmer3DomtabHmmqueryWriter +from .hmmer3_text import Hmmer3TextParser, Hmmer3TextIndexer +from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer, Hmmer3TabWriter + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..9eef1d6 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/_base.cpython-37.pyc b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/_base.cpython-37.pyc new file mode 100644 index 0000000..4c49c25 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/_base.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer2_text.cpython-37.pyc b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer2_text.cpython-37.pyc new file mode 100644 index 0000000..e375cb5 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer2_text.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_domtab.cpython-37.pyc b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_domtab.cpython-37.pyc new file mode 100644 index 0000000..0b4dcf2 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_domtab.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_tab.cpython-37.pyc b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_tab.cpython-37.pyc new file mode 100644 index 0000000..4f988c7 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_tab.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_text.cpython-37.pyc 
b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_text.cpython-37.pyc new file mode 100644 index 0000000..f47f107 Binary files /dev/null and b/code/lib/Bio/SearchIO/HmmerIO/__pycache__/hmmer3_text.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/HmmerIO/_base.py b/code/lib/Bio/SearchIO/HmmerIO/_base.py new file mode 100644 index 0000000..3c20ad7 --- /dev/null +++ b/code/lib/Bio/SearchIO/HmmerIO/_base.py @@ -0,0 +1,45 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO base classes for HMMER-related code.""" + +from Bio.SearchIO._index import SearchIndexer + + +class _BaseHmmerTextIndexer(SearchIndexer): + """Base indexer class for HMMER plain text output.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._preamble = b"" + + def get_raw(self, offset): + """Return the raw record from the file as a bytes string.""" + handle = self._handle + qresult_raw = b"" + + # read header first + if not self._preamble: + handle.seek(0) + while True: + line = handle.readline() + if line.startswith(self.qresult_start): + break + qresult_raw += line + else: + qresult_raw += self._preamble + + # and read the qresult raw string + handle.seek(offset) + while True: + # preserve whitespace, don't use read_forward + line = handle.readline() + qresult_raw += line + + # break when we've reached qresult end + if line.startswith(self.qresult_end) or not line: + break + + return qresult_raw diff --git a/code/lib/Bio/SearchIO/HmmerIO/hmmer2_text.py b/code/lib/Bio/SearchIO/HmmerIO/hmmer2_text.py new file mode 100644 index 0000000..f28ec22 --- /dev/null +++ b/code/lib/Bio/SearchIO/HmmerIO/hmmer2_text.py @@ -0,0 +1,374 @@ +# Copyright 2012 by Kai Blin. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
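The `get_raw` scaffolding in `_base.py` above is what the format-specific indexers in this package build on: each subclass records a byte offset per query, and `get_raw` replays the preamble plus that query's block on demand. A rough usage sketch of how this surfaces through the public API (the file name and query key are hypothetical):

```python
from Bio import SearchIO

# Index a large HMMER plain-text report; queries are located, not parsed.
idx = SearchIO.index("hmmer_report.txt", "hmmer3-text")

qresult = idx["query1"]      # parsed lazily from its recorded offset
raw = idx.get_raw("query1")  # preamble + raw query block, as bytes
idx.close()
```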
+"""Bio.SearchIO parser for HMMER 2 text output.""" + +import re + +from Bio.SearchIO._utils import read_forward +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + +from ._base import _BaseHmmerTextIndexer + +__all__ = ("Hmmer2TextParser", "Hmmer2TextIndexer") + + +_HSP_ALIGN_LINE = re.compile(r"(\S+):\s+domain (\d+) of (\d+)") + + +class _HitPlaceholder: + def createHit(self, hsp_list): + hit = Hit(hsp_list) + hit.id_ = self.id_ + hit.evalue = self.evalue + hit.bitscore = self.bitscore + if self.description: + hit.description = self.description + hit.domain_obs_num = self.domain_obs_num + return hit + + +class Hmmer2TextParser: + """Iterator for the HMMER 2.0 text output.""" + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + self.buf = [] + self._meta = self.parse_preamble() + + def __iter__(self): + """Iterate over Hmmer2TextParser, yields query results.""" + for qresult in self.parse_qresult(): + qresult.program = self._meta.get("program") + qresult.target = self._meta.get("target") + qresult.version = self._meta.get("version") + yield qresult + + def read_next(self, rstrip=True): + """Return the next non-empty line, trailing whitespace removed.""" + if len(self.buf) > 0: + return self.buf.pop() + self.line = self.handle.readline() + while self.line and rstrip and not self.line.strip(): + self.line = self.handle.readline() + if self.line: + if rstrip: + self.line = self.line.rstrip() + return self.line + + def push_back(self, line): + """Un-read a line that should not be parsed yet.""" + self.buf.append(line) + + def parse_key_value(self): + """Parse key-value pair separated by colon.""" + key, value = self.line.split(":", 1) + return key.strip(), value.strip() + + def parse_preamble(self): + """Parse HMMER2 preamble.""" + meta = {} + state = "GENERIC" + while self.read_next(): + if state == "GENERIC": + if self.line.startswith("hmm"): + meta["program"] = self.line.split("-")[0].strip() + elif self.line.startswith("HMMER is"): + continue + elif self.line.startswith("HMMER"): + meta["version"] = self.line.split()[1] + elif self.line.count("-") == 36: + state = "OPTIONS" + continue + + assert state == "OPTIONS" + assert "program" in meta + + if self.line.count("-") == 32: + break + + key, value = self.parse_key_value() + if meta["program"] == "hmmsearch": + if key == "Sequence database": + meta["target"] = value + continue + elif meta["program"] == "hmmpfam": + if key == "HMM file": + meta["target"] = value + continue + meta[key] = value + + return meta + + def parse_qresult(self): + """Parse a HMMER2 query block.""" + while self.read_next(): + if not self.line.startswith("Query"): + return + _, id_ = self.parse_key_value() + self.qresult = QueryResult(id=id_) + + description = None + + while self.read_next() and not self.line.startswith("Scores"): + if self.line.startswith("Accession"): + self.qresult.accession = self.parse_key_value()[1] + if self.line.startswith("Description"): + description = self.parse_key_value()[1] + + hit_placeholders = self.parse_hits() + if len(hit_placeholders) > 0: + self.parse_hsps(hit_placeholders) + self.parse_hsp_alignments() + + while not self.line.startswith("Query"): + self.read_next() + if not self.line: + break + self.buf.append(self.line) + + if description is not None: + self.qresult.description = description + yield self.qresult + + def parse_hits(self): + """Parse a HMMER2 hit block, beginning with the hit table.""" + hit_placeholders = [] + while self.read_next(): + if 
self.line.startswith("Parsed"): + break + if self.line.find("no hits") > -1: + break + + if ( + self.line.startswith("Sequence") + or self.line.startswith("Model") + or self.line.startswith("-------- ") + ): + continue + + fields = self.line.split() + id_ = fields.pop(0) + domain_obs_num = int(fields.pop()) + evalue = float(fields.pop()) + bitscore = float(fields.pop()) + description = " ".join(fields).strip() + + hit = _HitPlaceholder() + hit.id_ = id_ + hit.evalue = evalue + hit.bitscore = bitscore + hit.description = description + hit.domain_obs_num = domain_obs_num + hit_placeholders.append(hit) + + return hit_placeholders + + def parse_hsps(self, hit_placeholders): + """Parse a HMMER2 hsp block, beginning with the hsp table.""" + # HSPs may occur in different order than the hits + # so store Hit objects separately first + unordered_hits = {} + while self.read_next(): + if ( + self.line.startswith("Alignments") + or self.line.startswith("Histogram") + or self.line == "//" + ): + break + if ( + self.line.startswith("Model") + or self.line.startswith("Sequence") + or self.line.startswith("--------") + ): + continue + + ( + id_, + domain, + seq_f, + seq_t, + seq_compl, + hmm_f, + hmm_t, + hmm_compl, + score, + evalue, + ) = self.line.split() + + frag = HSPFragment(id_, self.qresult.id) + frag.molecule_type = "protein" + if self._meta["program"] == "hmmpfam": + frag.hit_start = int(hmm_f) - 1 + frag.hit_end = int(hmm_t) + frag.query_start = int(seq_f) - 1 + frag.query_end = int(seq_t) + elif self._meta["program"] == "hmmsearch": + frag.query_start = int(hmm_f) - 1 + frag.query_end = int(hmm_t) + frag.hit_start = int(seq_f) - 1 + frag.hit_end = int(seq_t) + + hsp = HSP([frag]) + hsp.evalue = float(evalue) + hsp.bitscore = float(score) + hsp.domain_index = int(domain.split("/")[0]) + if self._meta["program"] == "hmmpfam": + hsp.hit_endtype = hmm_compl + hsp.query_endtype = seq_compl + elif self._meta["program"] == "hmmsearch": + hsp.query_endtype = hmm_compl + hsp.hit_endtype = seq_compl + + if id_ not in unordered_hits: + placeholder = [p for p in hit_placeholders if p.id_ == id_][0] + hit = placeholder.createHit([hsp]) + unordered_hits[id_] = hit + else: + hit = unordered_hits[id_] + hsp.hit_description = hit.description + hit.append(hsp) + + # The placeholder list is in the correct order, so use that order for + # the Hit objects in the qresult + for p in hit_placeholders: + self.qresult.append(unordered_hits[p.id_]) + + def parse_hsp_alignments(self): + """Parse a HMMER2 HSP alignment block.""" + if not self.line.startswith("Alignments"): + return + + while self.read_next(): + if self.line == "//" or self.line.startswith("Histogram"): + break + + match = re.search(_HSP_ALIGN_LINE, self.line) + if match is None: + continue + + id_ = match.group(1) + idx = int(match.group(2)) + num = int(match.group(3)) + + hit = self.qresult[id_] + if hit.domain_obs_num != num: + continue + + frag = hit[idx - 1][0] + + hmmseq = "" + consensus = "" + otherseq = "" + structureseq = "" + pad = 0 + while self.read_next() and self.line.startswith(" "): + # if there's structure information, parse that + if self.line[16:18] == "CS": + structureseq += self.line[19:].strip() + + if not self.read_next(): + break + + # skip the *-> start marker if it exists + if self.line[19:22] == "*->": + seq = self.line[22:] + pad = 3 + else: + seq = self.line[19:] + pad = 0 + + hmmseq += seq + line_len = len(seq) + if not self.read_next(rstrip=False): + break + consensus += self.line[19 + pad : 19 + pad + line_len] + # If 
there's no consensus sequence, hmmer2 doesn't + # bother to put spaces here, so add extra padding + extra_padding = len(hmmseq) - len(consensus) + consensus += " " * extra_padding + + if not self.read_next(): + break + + # if we have a line break in the end marker, we get a + # whitespace-only otherseq line, making split()[0] return + # the end coordinate. That'll be a -, which is a valid character + # in the sequence, meaning we can't just strip it. + parts = self.line[19:].split() + if len(parts) == 2: + otherseq += self.line[19:].split()[0].strip() + + self.push_back(self.line) + + # get rid of the end marker + if hmmseq.endswith("<-*"): + hmmseq = hmmseq[:-3] + consensus = consensus[:-3] + + # add similarity sequence to annotation + frag.aln_annotation["similarity"] = consensus + + # if there's structure information, add it to the fragment + if structureseq: + frag.aln_annotation["CS"] = structureseq + + if self._meta["program"] == "hmmpfam": + frag.hit = hmmseq + frag.query = otherseq + else: + frag.hit = otherseq + frag.query = hmmseq + + +class Hmmer2TextIndexer(_BaseHmmerTextIndexer): + """Indexer for hmmer2-text format.""" + + _parser = Hmmer2TextParser + qresult_start = b"Query" + # qresults_ends for hmmpfam and hmmsearch + # need to anticipate both since hmmsearch have different query end mark + qresult_end = b"//" + + def __iter__(self): + """Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0.""" + handle = self._handle + handle.seek(0) + start_offset = handle.tell() + regex_id = re.compile(br"Query\s*(?:sequence|HMM)?:\s*(.*)") + + # determine flag for hmmsearch + is_hmmsearch = False + line = read_forward(handle) + if line.startswith(b"hmmsearch"): + is_hmmsearch = True + + while True: + end_offset = handle.tell() + + if line.startswith(self.qresult_start): + regx = re.search(regex_id, line) + qresult_key = regx.group(1).strip() + # qresult start offset is the offset of this line + # (starts with the start mark) + start_offset = end_offset - len(line) + elif line.startswith(self.qresult_end): + yield qresult_key.decode(), start_offset, 0 + start_offset = end_offset + elif not line: + # HACK: since hmmsearch can only have one query result + if is_hmmsearch: + yield qresult_key.decode(), start_offset, 0 + break + + line = read_forward(handle) + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/HmmerIO/hmmer3_domtab.py b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_domtab.py new file mode 100644 index 0000000..514106a --- /dev/null +++ b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_domtab.py @@ -0,0 +1,375 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. 
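With the HMMER2 parser and indexer in place, hmmpfam/hmmsearch 2.x reports can be walked query by query through the top-level SearchIO API. A minimal sketch, assuming a report file named as below:

```python
from Bio import SearchIO

# Each iteration yields one QueryResult; program, version, and target
# are filled in from the report's preamble.
for qresult in SearchIO.parse("hmmpfam.out", "hmmer2-text"):
    print(qresult.id, qresult.program, qresult.version)
    for hit in qresult:
        print(" ", hit.id, hit.evalue, hit.bitscore, hit.domain_obs_num)
```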
+"""Bio.SearchIO parser for HMMER domain table output format.""" + +from itertools import chain + +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + +from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer + +__all__ = ( + "Hmmer3DomtabHmmhitParser", + "Hmmer3DomtabHmmqueryParser", + "Hmmer3DomtabHmmhitIndexer", + "Hmmer3DomtabHmmqueryIndexer", + "Hmmer3DomtabHmmhitWriter", + "Hmmer3DomtabHmmqueryWriter", +) + + +class Hmmer3DomtabParser(Hmmer3TabParser): + """Base hmmer3-domtab iterator.""" + + def _parse_row(self): + """Return a dictionary of parsed row values (PRIVATE).""" + assert self.line + cols = [x for x in self.line.strip().split(" ") if x] + # if len(cols) > 23, we have extra description columns + # combine them all into one string in the 19th column + if len(cols) > 23: + cols[22] = " ".join(cols[22:]) + elif len(cols) < 23: + cols.append("") + assert len(cols) == 23 + + # assign parsed column data into qresult, hit, and hsp dicts + qresult = {} + qresult["id"] = cols[3] # query name + qresult["accession"] = cols[4] # query accession + qresult["seq_len"] = int(cols[5]) # qlen + hit = {} + hit["id"] = cols[0] # target name + hit["accession"] = cols[1] # target accession + hit["seq_len"] = int(cols[2]) # tlen + hit["evalue"] = float(cols[6]) # evalue + hit["bitscore"] = float(cols[7]) # score + hit["bias"] = float(cols[8]) # bias + hit["description"] = cols[22] # description of target + hsp = {} + hsp["domain_index"] = int(cols[9]) # # (domain number) + # not parsing cols[10] since it's basically len(hit) + hsp["evalue_cond"] = float(cols[11]) # c-evalue + hsp["evalue"] = float(cols[12]) # i-evalue + hsp["bitscore"] = float(cols[13]) # score + hsp["bias"] = float(cols[14]) # bias + hsp["env_start"] = int(cols[19]) - 1 # env from + hsp["env_end"] = int(cols[20]) # env to + hsp["acc_avg"] = float(cols[21]) # acc + frag = {} + # strand is always 0, since HMMER now only handles protein + frag["hit_strand"] = frag["query_strand"] = 0 + frag["hit_start"] = int(cols[15]) - 1 # hmm from + frag["hit_end"] = int(cols[16]) # hmm to + frag["query_start"] = int(cols[17]) - 1 # ali from + frag["query_end"] = int(cols[18]) # ali to + # HMMER results are always protein + frag["molecule_type"] = "protein" + + # switch hmm<-->ali coordinates if hmm is not hit + if not self.hmm_as_hit: + frag["hit_end"], frag["query_end"] = (frag["query_end"], frag["hit_end"]) + frag["hit_start"], frag["query_start"] = ( + frag["query_start"], + frag["hit_start"], + ) + + return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag} + + def _parse_qresult(self): + """Return QueryResult objects (PRIVATE).""" + # state values, determines what to do for each line + state_EOF = 0 + state_QRES_NEW = 1 + state_QRES_SAME = 3 + state_HIT_NEW = 2 + state_HIT_SAME = 4 + # dummies for initial states + qres_state = None + hit_state = None + file_state = None + # dummies for initial id caches + prev_qid = None + prev_hid = None + # dummies for initial parsed value containers + cur, prev = None, None + hit_list, hsp_list = [], [] + cur_qid = None + cur_hid = None + while True: + # store previous line's parsed values, for every line after the 1st + if cur is not None: + prev = cur + prev_qid = cur_qid + prev_hid = cur_hid + # only parse the line if it's not EOF + if self.line and not self.line.startswith("#"): + cur = self._parse_row() + cur_qid = cur["qresult"]["id"] + cur_hid = cur["hit"]["id"] + else: + file_state = state_EOF + # mock ID values since the line is empty + cur_qid, cur_hid = None, None + + 
# get the state of hit and qresult + if prev_qid != cur_qid: + qres_state = state_QRES_NEW + else: + qres_state = state_QRES_SAME + # new hits are hits with different ids or hits in a new qresult + if prev_hid != cur_hid or qres_state == state_QRES_NEW: + hit_state = state_HIT_NEW + else: + hit_state = state_HIT_SAME + + # start creating objects after the first line (i.e. prev is filled) + if prev is not None: + # each line is basically an HSP with one HSPFragment + frag = HSPFragment(prev_hid, prev_qid) + for attr, value in prev["frag"].items(): + setattr(frag, attr, value) + hsp = HSP([frag]) + for attr, value in prev["hsp"].items(): + setattr(hsp, attr, value) + hsp_list.append(hsp) + + # create hit object when we've finished parsing all its hsps + # i.e. when hit state is state_HIT_NEW + if hit_state == state_HIT_NEW: + hit = Hit(hsp_list) + for attr, value in prev["hit"].items(): + setattr(hit, attr, value) + hit_list.append(hit) + hsp_list = [] + + # create qresult and yield if we're at a new qresult or EOF + if qres_state == state_QRES_NEW or file_state == state_EOF: + qresult = QueryResult(hit_list, prev_qid) + for attr, value in prev["qresult"].items(): + setattr(qresult, attr, value) + yield qresult + # if current line is EOF, break + if file_state == state_EOF: + break + hit_list = [] + + self.line = self.handle.readline() + + +class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser): + """HMMER domain table parser using hit coordinates. + + Parser for the HMMER domain table format that assumes HMM profile + coordinates are hit coordinates. + """ + + hmm_as_hit = True + + +class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser): + """HMMER domain table parser using query coordinates. + + Parser for the HMMER domain table format that assumes HMM profile + coordinates are query coordinates. + """ + + hmm_as_hit = False + + +class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer): + """HMMER domain table indexer using hit coordinates. + + Indexer class for HMMER domain table output that assumes HMM profile + coordinates are hit coordinates. + """ + + _parser = Hmmer3DomtabHmmhitParser + _query_id_idx = 3 + + +class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer): + """HMMER domain table indexer using query coordinates. + + Indexer class for HMMER domain table output that assumes HMM profile + coordinates are query coordinates. + """ + + _parser = Hmmer3DomtabHmmqueryParser + _query_id_idx = 3 + + +class Hmmer3DomtabHmmhitWriter: + """HMMER domain table writer using hit coordinates. + + Writer for hmmer3-domtab output format which writes hit coordinates + as HMM profile coordinates. + """ + + hmm_as_hit = True + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + + def write_file(self, qresults): + """Write to the handle. + + Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 
+ + """ + handle = self.handle + qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 + + try: + first_qresult = next(qresults) + except StopIteration: + handle.write(self._build_header()) + else: + # write header + handle.write(self._build_header(first_qresult)) + # and then the qresults + for qresult in chain([first_qresult], qresults): + if qresult: + handle.write(self._build_row(qresult)) + qresult_counter += 1 + hit_counter += len(qresult) + hsp_counter += sum(len(hit) for hit in qresult) + frag_counter += sum(len(hit.fragments) for hit in qresult) + + return qresult_counter, hit_counter, hsp_counter, frag_counter + + def _build_header(self, first_qresult=None): + """Return the header string of a domain HMMER table output (PRIVATE).""" + # calculate whitespace required + # adapted from HMMER's source: src/p7_tophits.c#L1157 + if first_qresult: + # qnamew = max(20, len(first_qresult.id)) + qnamew = 20 + tnamew = max(20, len(first_qresult[0].id)) + try: + qaccw = max(10, len(first_qresult.acc)) + taccw = max(10, len(first_qresult[0].acc)) + except AttributeError: + qaccw, taccw = 10, 10 + else: + qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 + # Turn black code style off + # fmt: off + header = ("#%*s %22s %40s %11s %11s %11s\n" + % (tnamew + qnamew - 1 + 15 + taccw + qaccw, "", "--- full sequence ---", + "-------------- this domain -------------", "hmm coord", + "ali coord", "env coord")) + header += ("#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s " + "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" + % (tnamew - 1, + " target name", taccw, "accession", "tlen", qnamew, + "query name", qaccw, "accession", "qlen", "E-value", "score", + "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias", + "from", "to", "from", "to", "from", "to", "acc", + "description of target")) + header += ("#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s " + "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" + % (tnamew - 1, + "-------------------", taccw, "----------", "-----", + qnamew, "--------------------", qaccw, "----------", + "-----", "---------", "------", "-----", "---", "---", + "---------", "---------", "------", "-----", "-----", "-----", + "-----", "-----", "-----", "-----", "----", + "---------------------")) + # Turn black code style on + # fmt: on + return header + + def _build_row(self, qresult): + """Return a string or one row or more of the QueryResult object (PRIVATE).""" + rows = "" + + # calculate whitespace required + # adapted from HMMER's source: src/p7_tophits.c#L1083 + qnamew = max(20, len(qresult.id)) + tnamew = max(20, len(qresult[0].id)) + try: + qaccw = max(10, len(qresult.accession)) + taccw = max(10, len(qresult[0].accession)) + qresult_acc = qresult.accession + except AttributeError: + qaccw, taccw = 10, 10 + qresult_acc = "-" + + for hit in qresult: + + # try to get hit accession + try: + hit_acc = hit.accession + except AttributeError: + hit_acc = "-" + + for hsp in hit.hsps: + if self.hmm_as_hit: + hmm_to = hsp.hit_end + hmm_from = hsp.hit_start + 1 + ali_to = hsp.query_end + ali_from = hsp.query_start + 1 + else: + hmm_to = hsp.query_end + hmm_from = hsp.query_start + 1 + ali_to = hsp.hit_end + ali_from = hsp.hit_start + 1 + + rows += ( + "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d" + " %3d %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld" + " %5d %5d %4.2f %s\n" + % ( + tnamew, + hit.id, + taccw, + hit_acc, + hit.seq_len, + qnamew, + qresult.id, + qaccw, + qresult_acc, + qresult.seq_len, + hit.evalue, + hit.bitscore, + hit.bias, + hsp.domain_index, + 
len(hit.hsps), + hsp.evalue_cond, + hsp.evalue, + hsp.bitscore, + hsp.bias, + hmm_from, + hmm_to, + ali_from, + ali_to, + hsp.env_start + 1, + hsp.env_end, + hsp.acc_avg, + hit.description, + ) + ) + + return rows + + +class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter): + """HMMER domain table writer using query coordinates. + + Writer for hmmer3-domtab output format which writes query coordinates + as HMM profile coordinates. + """ + + hmm_as_hit = False + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/HmmerIO/hmmer3_tab.py b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_tab.py new file mode 100644 index 0000000..a380732 --- /dev/null +++ b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_tab.py @@ -0,0 +1,335 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for HMMER table output format.""" + +from itertools import chain + +from Bio.SearchIO._index import SearchIndexer +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + + +__all__ = ("Hmmer3TabParser", "Hmmer3TabIndexer", "Hmmer3TabWriter") + + +class Hmmer3TabParser: + """Parser for the HMMER table format.""" + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + self.line = self.handle.readline() + + def __iter__(self): + """Iterate over Hmmer3TabParser, yields query results.""" + header_mark = "#" + # read through the header if it exists + while self.line.startswith(header_mark): + self.line = self.handle.readline() + # if we have result rows, parse it + if self.line: + yield from self._parse_qresult() + + def _parse_row(self): + """Return a dictionary of parsed row values (PRIVATE).""" + cols = [x for x in self.line.strip().split(" ") if x] + if len(cols) < 18: + raise ValueError("Less columns than expected, only %i" % len(cols)) + # if len(cols) > 19, we have extra description columns + # combine them all into one string in the 19th column + cols[18] = " ".join(cols[18:]) + + # assign parsed column data into qresult, hit, and hsp dicts + qresult = {} + qresult["id"] = cols[2] # query name + qresult["accession"] = cols[3] # query accession + hit = {} + hit["id"] = cols[0] # target name + hit["accession"] = cols[1] # target accession + hit["evalue"] = float(cols[4]) # evalue (full sequence) + hit["bitscore"] = float(cols[5]) # score (full sequence) + hit["bias"] = float(cols[6]) # bias (full sequence) + hit["domain_exp_num"] = float(cols[10]) # exp + hit["region_num"] = int(cols[11]) # reg + hit["cluster_num"] = int(cols[12]) # clu + hit["overlap_num"] = int(cols[13]) # ov + hit["env_num"] = int(cols[14]) # env + hit["domain_obs_num"] = int(cols[15]) # dom + hit["domain_reported_num"] = int(cols[16]) # rep + hit["domain_included_num"] = int(cols[17]) # inc + hit["description"] = cols[18] # description of target + hsp = {} + hsp["evalue"] = float(cols[7]) # evalue (best 1 domain) + hsp["bitscore"] = float(cols[8]) # score (best 1 domain) + hsp["bias"] = float(cols[9]) # bias (best 1 domain) + # strand is always 0, since HMMER now only handles protein + frag = {} + frag["hit_strand"] = frag["query_strand"] = 0 + frag["molecule_type"] = "protein" + + return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": 
frag} + + def _parse_qresult(self): + """Return QueryResult objects (PRIVATE).""" + # state values, determines what to do for each line + state_EOF = 0 + state_QRES_NEW = 1 + state_QRES_SAME = 3 + # initial value dummies + qres_state = None + file_state = None + prev_qid = None + cur, prev = None, None + # container for Hit objects, used to create QueryResult + hit_list = [] + cur_qid = None + while True: + # store previous line's parsed values for all lines after the first + if cur is not None: + prev = cur + prev_qid = cur_qid + # only parse the result row if it's not EOF + # NOTE: we are not parsing the extra '#' lines appended to the end + # of hmmer31b1 tabular results since storing them in qresult + # objects means we can not do a single-pass parsing + if self.line and not self.line.startswith("#"): + cur = self._parse_row() + cur_qid = cur["qresult"]["id"] + else: + file_state = state_EOF + # mock value for cur_qid, since we have nothing to parse + cur_qid = None + + if prev_qid != cur_qid: + qres_state = state_QRES_NEW + else: + qres_state = state_QRES_SAME + + if prev is not None: + # since domain tab formats only have 1 Hit per line + # we always create HSPFragment, HSP, and Hit per line + prev_hid = prev["hit"]["id"] + + # create fragment and HSP and set their attributes + frag = HSPFragment(prev_hid, prev_qid) + for attr, value in prev["frag"].items(): + setattr(frag, attr, value) + hsp = HSP([frag]) + for attr, value in prev["hsp"].items(): + setattr(hsp, attr, value) + + # create Hit and set its attributes + hit = Hit([hsp]) + for attr, value in prev["hit"].items(): + setattr(hit, attr, value) + hit_list.append(hit) + + # create qresult and yield if we're at a new qresult or at EOF + if qres_state == state_QRES_NEW or file_state == state_EOF: + qresult = QueryResult(hit_list, prev_qid) + for attr, value in prev["qresult"].items(): + setattr(qresult, attr, value) + yield qresult + # if we're at EOF, break + if file_state == state_EOF: + break + hit_list = [] + + self.line = self.handle.readline() + + +class Hmmer3TabIndexer(SearchIndexer): + """Indexer class for HMMER table output.""" + + _parser = Hmmer3TabParser + # denotes column location for query identifier + _query_id_idx = 2 + + def __iter__(self): + """Iterate over the file handle; yields key, start offset, and length.""" + handle = self._handle + handle.seek(0) + query_id_idx = self._query_id_idx + qresult_key = None + header_mark = b"#" + split_mark = b" " + # set line with initial mock value, to emulate header + line = header_mark + + # read through header + while line.startswith(header_mark): + start_offset = handle.tell() + line = handle.readline() + + # and index the qresults + while True: + end_offset = handle.tell() + + if not line: + break + + cols = [x for x in line.strip().split(split_mark) if x] + if qresult_key is None: + qresult_key = cols[query_id_idx] + else: + curr_key = cols[query_id_idx] + + if curr_key != qresult_key: + adj_end = end_offset - len(line) + yield (qresult_key.decode(), start_offset, adj_end - start_offset) + qresult_key = curr_key + start_offset = adj_end + + line = handle.readline() + if not line: + yield (qresult_key.decode(), start_offset, end_offset - start_offset) + break + + def get_raw(self, offset): + """Return the raw bytes string of a QueryResult object from the given offset.""" + handle = self._handle + handle.seek(offset) + query_id_idx = self._query_id_idx + qresult_key = None + qresult_raw = b"" + split_mark = b" " + + while True: + line = handle.readline() + if not line: 
+ break + cols = [x for x in line.strip().split(split_mark) if x] + if qresult_key is None: + qresult_key = cols[query_id_idx] + else: + curr_key = cols[query_id_idx] + if curr_key != qresult_key: + break + qresult_raw += line + + return qresult_raw + + +class Hmmer3TabWriter: + """Writer for hmmer3-tab output format.""" + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + + def write_file(self, qresults): + """Write to the handle. + + Returns a tuple of how many QueryResult, Hit, and HSP objects were written. + + """ + handle = self.handle + qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 + + try: + first_qresult = next(qresults) + except StopIteration: + handle.write(self._build_header()) + else: + # write header + handle.write(self._build_header(first_qresult)) + # and then the qresults + for qresult in chain([first_qresult], qresults): + if qresult: + handle.write(self._build_row(qresult)) + qresult_counter += 1 + hit_counter += len(qresult) + hsp_counter += sum(len(hit) for hit in qresult) + frag_counter += sum(len(hit.fragments) for hit in qresult) + + return qresult_counter, hit_counter, hsp_counter, frag_counter + + def _build_header(self, first_qresult=None): + """Return the header string of a HMMER table output (PRIVATE).""" + # calculate whitespace required + # adapted from HMMER's source: src/p7_tophits.c#L1083 + if first_qresult is not None: + # qnamew = max(20, len(first_qresult.id)) + qnamew = 20 # why doesn't the above work? + tnamew = max(20, len(first_qresult[0].id)) + qaccw = max(10, len(first_qresult.accession)) + taccw = max(10, len(first_qresult[0].accession)) + else: + qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 + # Turn black code style off + # fmt: off + header = ("#%*s %22s %22s %33s\n" + % (tnamew + qnamew + taccw + qaccw + 2, "", + "--- full sequence ----", "--- best 1 domain ----", + "--- domain number estimation ----")) + header += ("#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s " + "%3s %3s %3s %3s %3s %3s %s\n" + % (tnamew - 1, " target name", + taccw, "accession", qnamew, "query name", qaccw, + "accession", " E-value", " score", " bias", + " E-value", " score", " bias", "exp", + "reg", "clu", " ov", "env", "dom", "rep", + "inc", "description of target")) + header += ("#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s " + "%3s %3s %3s %3s %3s %s\n" + % (tnamew - 1, "-------------------", + taccw, "----------", qnamew, "--------------------", qaccw, + "----------", "---------", "------", "-----", "---------", + "------", "-----", "---", "---", "---", "---", "---", "---", + "---", "---", "---------------------")) + # Turn black code style on + # fmt: on + return header + + def _build_row(self, qresult): + """Return a string or one row or more of the QueryResult object (PRIVATE).""" + rows = "" + + # calculate whitespace required + # adapted from HMMER's source: src/p7_tophits.c#L1083 + qnamew = max(20, len(qresult.id)) + tnamew = max(20, len(qresult[0].id)) + qaccw = max(10, len(qresult.accession)) + taccw = max(10, len(qresult[0].accession)) + + for hit in qresult: + rows += ( + "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f" + " %5.1f %5.1f %3d %3d %3d %3d %3d %3d %3d %s\n" + % ( + tnamew, + hit.id, + taccw, + hit.accession, + qnamew, + qresult.id, + qaccw, + qresult.accession, + hit.evalue, + hit.bitscore, + hit.bias, + hit.hsps[0].evalue, + hit.hsps[0].bitscore, + hit.hsps[0].bias, + hit.domain_exp_num, + hit.region_num, + hit.cluster_num, + hit.overlap_num, + hit.env_num, + 
hit.domain_obs_num, + hit.domain_reported_num, + hit.domain_included_num, + hit.description, + ) + ) + + return rows + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/HmmerIO/hmmer3_text.py b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_text.py new file mode 100644 index 0000000..9cc087c --- /dev/null +++ b/code/lib/Bio/SearchIO/HmmerIO/hmmer3_text.py @@ -0,0 +1,436 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO parser for HMMER plain text output format.""" + +import re + +from Bio.SearchIO._utils import read_forward +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + +from ._base import _BaseHmmerTextIndexer + +__all__ = ("Hmmer3TextParser", "Hmmer3TextIndexer") + + +# precompile regex patterns for faster processing +# regex for program name capture +_RE_PROGRAM = re.compile(r"^# (\w*hmm\w+) :: .*$") +# regex for version string capture +_RE_VERSION = re.compile(r"# \w+ ([\w+\.]+) .*; http.*$") +# regex for option string capture +_RE_OPT = re.compile(r"^# (.+):\s+(.+)$") +# regex for parsing query id and length, for parsing +_QRE_ID_LEN_PTN = r"^Query:\s*(.*)\s+\[\w=(\d+)\]" +_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN) +# regex for hsp validation +_HRE_VALIDATE = re.compile(r"score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)") +# regexes for parsing hsp alignment blocks +_HRE_ANNOT_LINE = re.compile(r"^(\s+)(.+)\s(\w+)") +_HRE_ID_LINE = re.compile(r"^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)") + + +class Hmmer3TextParser: + """Parser for the HMMER 3.0 text output.""" + + def __init__(self, handle): + """Initialize the class.""" + self.handle = handle + self.line = read_forward(self.handle) + self._meta = self._parse_preamble() + + def __iter__(self): + """Iterate over query results.""" + yield from self._parse_qresult() + + def _read_until(self, bool_func): + """Read the file handle until the given function returns True (PRIVATE).""" + while True: + if not self.line or bool_func(self.line): + return + else: + self.line = read_forward(self.handle) + + def _parse_preamble(self): + """Parse HMMER preamble (lines beginning with '#') (PRIVATE).""" + meta = {} + # bool flag for storing state ~ whether we are parsing the option + # lines or not + has_opts = False + while True: + # no pound sign means we've left the preamble + if not self.line.startswith("#"): + break + # dashes could either mean we are entering or leaving the options + # section ~ so it's a switch for the has_opts flag + elif "- - -" in self.line: + if not has_opts: + # if flag is false, that means we're entering opts + # so switch the flag accordingly + has_opts = True + else: + # if flag is true, that means we've reached the end of opts + # so we can break out of the function + break + elif not has_opts: + # try parsing program + regx = re.search(_RE_PROGRAM, self.line) + if regx: + meta["program"] = regx.group(1) + # try parsing version + regx = re.search(_RE_VERSION, self.line) + if regx: + meta["version"] = regx.group(1) + elif has_opts: + regx = re.search(_RE_OPT, self.line) + # if target in regx.group(1), then we store the key as target + if "target" in regx.group(1): + meta["target"] = regx.group(2).strip() + else: + meta[regx.group(1)] = 
regx.group(2) + + self.line = read_forward(self.handle) + + return meta + + def _parse_qresult(self): + """Parse a HMMER3 query block (PRIVATE).""" + self._read_until(lambda line: line.startswith("Query:")) + + while self.line: + + regx = re.search(_QRE_ID_LEN, self.line) + + while not regx: + self.line = read_forward(self.handle) + regx = re.search(_QRE_ID_LEN, self.line) + + # get query id and length + qid = regx.group(1).strip() + # store qresult attributes + qresult_attrs = { + "seq_len": int(regx.group(2)), + "program": self._meta.get("program"), + "version": self._meta.get("version"), + "target": self._meta.get("target"), + } + + # get description and accession, if they exist + qdesc = "" # placeholder + while not self.line.startswith("Scores for "): + self.line = read_forward(self.handle) + + if self.line.startswith("Accession:"): + acc = self.line.strip().split(" ", 1)[1] + qresult_attrs["accession"] = acc.strip() + elif self.line.startswith("Description:"): + qdesc = self.line.strip().split(" ", 1)[1].strip() + qresult_attrs["description"] = qdesc + + # parse the query hits + while self.line and "//" not in self.line: + hit_list = self._parse_hit(qid, qdesc) + # read through the statistics summary + # TODO: parse and store this information? + if self.line.startswith("Internal pipeline"): + while self.line and "//" not in self.line: + self.line = read_forward(self.handle) + + # create qresult, set its attributes and yield + # not initializing hit_list directly to handle empty hits + # (i.e. need to set its query description manually) + qresult = QueryResult(id=qid, hits=hit_list) + for attr, value in qresult_attrs.items(): + setattr(qresult, attr, value) + yield qresult + self.line = read_forward(self.handle) + + # Skip line beginning with '# Alignment of', which are output + # when running phmmer with the '-A' flag. + if self.line.startswith("#"): + self.line = self.handle.readline() + + # HMMER >= 3.1 outputs '[ok]' at the end of all results file, + # which means we can break the main loop when we see the line + if "[ok]" in self.line: + break + + def _parse_hit(self, qid, qdesc): + """Parse a HMMER3 hit block, beginning with the hit table (PRIVATE).""" + # get to the end of the hit table delimiter and read one more line + self._read_until(lambda line: line.startswith(" ------- ------ -----")) + self.line = read_forward(self.handle) + + # assume every hit is in inclusion threshold until the inclusion + # threshold line is encountered + is_included = True + + # parse the hit table + hit_attr_list = [] + while True: + if not self.line: + return [] + elif self.line.startswith(" ------ inclusion"): + is_included = False + self.line = read_forward(self.handle) + # if there are no hits, then there are no hsps + # so we forward-read until 'Internal pipeline..' 
+ elif self.line.startswith(" [No hits detected that satisfy reporting"): + while True: + self.line = read_forward(self.handle) + if self.line.startswith("Internal pipeline"): + assert len(hit_attr_list) == 0 + return [] + elif self.line.startswith("Domain annotation for each "): + hit_list = self._create_hits(hit_attr_list, qid, qdesc) + return hit_list + # entering hit results row + # parse the columns into a list + row = [x for x in self.line.strip().split(" ") if x] + # join the description words if it's >1 word + if len(row) > 10: + row[9] = " ".join(row[9:]) + # if there's no description, set it to an empty string + elif len(row) < 10: + row.append("") + assert len(row) == 10 + # create the hit object + hit_attrs = { + "id": row[8], + "query_id": qid, + "evalue": float(row[0]), + "bitscore": float(row[1]), + "bias": float(row[2]), + # row[3:6] is not parsed, since the info is available + # at the HSP level + "domain_exp_num": float(row[6]), + "domain_obs_num": int(row[7]), + "description": row[9], + "is_included": is_included, + } + hit_attr_list.append(hit_attrs) + + self.line = read_forward(self.handle) + + def _create_hits(self, hit_attrs, qid, qdesc): + """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE).""" + # read through until the beginning of the hsp block + self._read_until( + lambda line: line.startswith("Internal pipeline") or line.startswith(">>") + ) + + # start parsing the hsp block + hit_list = [] + while True: + if self.line.startswith("Internal pipeline"): + # by this time we should've emptied the hit attr list + assert len(hit_attrs) == 0 + return hit_list + assert self.line.startswith(">>") + hid, hdesc = self.line[len(">> ") :].split(" ", 1) + hdesc = hdesc.strip() + + # read through the hsp table header and move one more line + self._read_until( + lambda line: line.startswith(" --- ------ ----- --------") + or line.startswith(" [No individual domains") + ) + self.line = read_forward(self.handle) + + # parse the hsp table for the current hit + hsp_list = [] + while True: + # break out of hsp parsing if there are no hits, it's the last hsp + # or it's the start of a new hit + if ( + self.line.startswith(" [No targets detected that satisfy") + or self.line.startswith(" [No individual domains") + or self.line.startswith("Internal pipeline statistics summary:") + or self.line.startswith(" Alignments for each domain:") + or self.line.startswith(">>") + ): + + hit_attr = hit_attrs.pop(0) + hit = Hit(hsp_list) + for attr, value in hit_attr.items(): + if attr == "description": + cur_val = getattr(hit, attr) + if cur_val and value and cur_val.startswith(value): + continue + setattr(hit, attr, value) + if not hit: + hit.query_description = qdesc + hit_list.append(hit) + break + + parsed = [x for x in self.line.strip().split(" ") if x] + assert len(parsed) == 16 + # parsed column order: + # index, is_included, bitscore, bias, evalue_cond, evalue + # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito, + # envfrom, envto, acc_avg + frag = HSPFragment(hid, qid) + # set query and hit descriptions if they are defined / nonempty string + if qdesc: + frag.query_description = qdesc + if hdesc: + frag.hit_description = hdesc + # HMMER3 results are always protein + frag.molecule_type = "protein" + # depending on whether the program is hmmsearch, hmmscan, or phmmer + # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to} + # for hmmscan, hit is the hmm profile, query is the sequence + if self._meta.get("program") == "hmmscan": + # adjust 'from' and 'to' 
coordinates to 0-based ones + frag.hit_start = int(parsed[6]) - 1 + frag.hit_end = int(parsed[7]) + frag.query_start = int(parsed[9]) - 1 + frag.query_end = int(parsed[10]) + elif self._meta.get("program") in ["hmmsearch", "phmmer"]: + # adjust 'from' and 'to' coordinates to 0-based ones + frag.hit_start = int(parsed[9]) - 1 + frag.hit_end = int(parsed[10]) + frag.query_start = int(parsed[6]) - 1 + frag.query_end = int(parsed[7]) + # strand is always 0, since HMMER now only handles protein + frag.hit_strand = frag.query_strand = 0 + + hsp = HSP([frag]) + hsp.domain_index = int(parsed[0]) + hsp.is_included = parsed[1] == "!" + hsp.bitscore = float(parsed[2]) + hsp.bias = float(parsed[3]) + hsp.evalue_cond = float(parsed[4]) + hsp.evalue = float(parsed[5]) + if self._meta.get("program") == "hmmscan": + # adjust 'from' and 'to' coordinates to 0-based ones + hsp.hit_endtype = parsed[8] + hsp.query_endtype = parsed[11] + elif self._meta.get("program") in ["hmmsearch", "phmmer"]: + # adjust 'from' and 'to' coordinates to 0-based ones + hsp.hit_endtype = parsed[11] + hsp.query_endtype = parsed[8] + # adjust 'from' and 'to' coordinates to 0-based ones + hsp.env_start = int(parsed[12]) - 1 + hsp.env_end = int(parsed[13]) + hsp.env_endtype = parsed[14] + hsp.acc_avg = float(parsed[15]) + + hsp_list.append(hsp) + self.line = read_forward(self.handle) + + # parse the hsp alignments + if self.line.startswith(" Alignments for each domain:"): + self._parse_aln_block(hid, hit.hsps) + + def _parse_aln_block(self, hid, hsp_list): + """Parse a HMMER3 HSP alignment block (PRIVATE).""" + self.line = read_forward(self.handle) + dom_counter = 0 + while True: + if self.line.startswith(">>") or self.line.startswith("Internal pipeline"): + return hsp_list + assert self.line.startswith(" == domain %i" % (dom_counter + 1)) + # alias hsp to local var + # but note that we're still changing the attrs of the actual + # hsp inside the qresult as we're not creating a copy + frag = hsp_list[dom_counter][0] + # XXX: should we validate again here? regex is expensive.. + # regx = re.search(_HRE_VALIDATE, self.line) + # assert hsp.bitscore == float(regx.group(1)) + # assert hsp.evalue_cond == float(regx.group(2)) + hmmseq = "" + aliseq = "" + annot = {} + self.line = self.handle.readline() + + # parse all the alignment blocks in the hsp + while True: + + regx = None + + # check for hit or query line + # we don't check for the hit or query id specifically + # to anticipate special cases where query id == hit id + regx = re.search(_HRE_ID_LINE, self.line) + if regx: + # the first hit/query self.line we encounter is the hmmseq + if len(hmmseq) == len(aliseq): + hmmseq += regx.group(2) + # and for subsequent self.lines, len(hmmseq) is either + # > or == len(aliseq) + elif len(hmmseq) > len(aliseq): + aliseq += regx.group(2) + assert len(hmmseq) >= len(aliseq) + # check for start of new domain + elif ( + self.line.startswith(" == domain") + or self.line.startswith(">>") + or self.line.startswith("Internal pipeline") + ): + frag.aln_annotation = annot + if self._meta.get("program") == "hmmscan": + frag.hit = hmmseq + frag.query = aliseq + elif self._meta.get("program") in ["hmmsearch", "phmmer"]: + frag.hit = aliseq + frag.query = hmmseq + dom_counter += 1 + hmmseq = "" + aliseq = "" + annot = {} + break + # otherwise check if it's an annotation line and parse it + # len(hmmseq) is only != len(aliseq) when the cursor is parsing + # the similarity character. 
Since we're not parsing that, we + # check for when the condition is False (i.e. when it's ==) + elif len(hmmseq) == len(aliseq): + regx = re.search(_HRE_ANNOT_LINE, self.line) + if regx: + annot_name = regx.group(3) + if annot_name in annot: + annot[annot_name] += regx.group(2) + else: + annot[annot_name] = regx.group(2) + + self.line = self.handle.readline() + + +class Hmmer3TextIndexer(_BaseHmmerTextIndexer): + """Indexer class for HMMER plain text output.""" + + _parser = Hmmer3TextParser + qresult_start = b"Query: " + qresult_end = b"//" + + def __iter__(self): + """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0.""" + handle = self._handle + handle.seek(0) + start_offset = handle.tell() + regex_id = re.compile(_QRE_ID_LEN_PTN.encode()) + + while True: + line = read_forward(handle) + end_offset = handle.tell() + + if line.startswith(self.qresult_start): + regx = re.search(regex_id, line) + qresult_key = regx.group(1).strip() + # qresult start offset is the offset of this line + # (starts with the start mark) + start_offset = end_offset - len(line) + elif line.startswith(self.qresult_end): + yield qresult_key.decode(), start_offset, 0 + start_offset = end_offset + elif not line: + break + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/InterproscanIO/__init__.py b/code/lib/Bio/SearchIO/InterproscanIO/__init__.py new file mode 100644 index 0000000..620a519 --- /dev/null +++ b/code/lib/Bio/SearchIO/InterproscanIO/__init__.py @@ -0,0 +1,96 @@ +# Copyright 2018 by Adhemar Zerlotini. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. +"""Bio.SearchIO support for InterProScan output formats. + +This module adds support for parsing InterProScan XML output. +The InterProScan is available as a command line program or on +EMBL-EBI's web page. 
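As a quick orientation before the details below, the parser is reached through the usual SearchIO entry points with the 'interproscan-xml' format name; a minimal sketch (the input file name is hypothetical):

```python
from Bio import SearchIO

# One QueryResult per protein element, one Hit per matched signature.
for qresult in SearchIO.parse("scan_results.xml", "interproscan-xml"):
    for hit in qresult:
        print(hit.id, hit.attributes["Hit type"], hit.dbxrefs)
```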
+Bio.SearchIO.InterproscanIO was tested on the following version: + +- versions: 5.26-65.0 (interproscan-model-2.1.xsd) + +More information about InterProScan is available through these links: +- Publication: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3998142/ +- Web interface: https://www.ebi.ac.uk/interpro/search/sequence-search +- Documentation: https://github.com/ebi-pf-team/interproscan/wiki + + +Supported format +================ + +Bio.SearchIO.InterproscanIO supports the following format: + +- XML - 'interproscan-xml' - parsing + + +interproscan-xml +================ + +The interproscan-xml parser follows the InterProScan XML described here: +https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats + ++--------------+--------------------+------------------------------------------+ +| Object | Attribute | XML Element | ++==============+====================+==========================================+ +| QueryResult | target | ``InterPro`` | +| +--------------------+------------------------------------------+ +| | program | ``InterProScan`` | +| +--------------------+------------------------------------------+ +| | version | ``protein-matches.interproscan-version`` | ++--------------+--------------------+------------------------------------------+ +| Hit | accession | ``signature.name`` | +| +--------------------+------------------------------------------+ +| | id | ``signature.ac`` | +| +--------------------+------------------------------------------+ +| | description | ``signature.desc`` | +| +--------------------+------------------------------------------+ +| | dbxrefs | ``IPR:entry.ac`` | +| | | ``go-xref.id`` | +| | | ``pathway-xref.db:pathway-xref.id`` | +| +--------------------+------------------------------------------+ +| | attributes | | +| | ['Target'] | ``signature-library-release.library`` | +| | ['Target version'] | ``signature-library-release.version`` | +| | ['Hit type'] | ``*-match`` / ``*-location`` | ++--------------+--------------------+------------------------------------------+ +| HSP | bitscore | ``*-location.score`` | +| +--------------------+------------------------------------------+ +| | evalue | ``*-location.evalue`` | ++--------------+--------------------+------------------------------------------+ +| HSPFragment | query_start | ``*-location.start`` | +| (also via +--------------------+------------------------------------------+ +| HSP) | query_end | ``*-location.end`` | +| +--------------------+------------------------------------------+ +| | hit_start | ``*-location.hmm-start`` | +| +--------------------+------------------------------------------+ +| | hit_end | ``*-location.hmm-end`` | +| +--------------------+------------------------------------------+ +| | query | ``sequence`` | ++--------------+--------------------+------------------------------------------+ + +InterProScan XML files may contain a match with multiple locations or multiple +matches to the same protein with a single location. In both cases, the match +is uniquely stored as a Hit object and the locations as HSP objects. + +``HSP.*start == *start - 1`` (Since every start position is 0-based in Biopython) + +``HSP.aln_span == query-end - query-start`` + +The types of matches or locations (e.g. hmmer3-match, hmmer3-location, +coils-match, panther-location) are stored in hit.attributes['Hit type']. +For instance, for every 'phobius-match', there will be a 'phobius-location'. 
+Therefore, hit.attributes['Hit type'] will store the string excluding '-match' or '-location' +('phobius', in this example). +""" + +from .interproscan_xml import InterproscanXmlParser + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..8571d35 Binary files /dev/null and b/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/interproscan_xml.cpython-37.pyc b/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/interproscan_xml.cpython-37.pyc new file mode 100644 index 0000000..bde264a Binary files /dev/null and b/code/lib/Bio/SearchIO/InterproscanIO/__pycache__/interproscan_xml.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/InterproscanIO/interproscan_xml.py b/code/lib/Bio/SearchIO/InterproscanIO/interproscan_xml.py new file mode 100644 index 0000000..97625b0 --- /dev/null +++ b/code/lib/Bio/SearchIO/InterproscanIO/interproscan_xml.py @@ -0,0 +1,194 @@ +# Copyright 2018 by Adhemar Zerlotini. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Bio.SearchIO parser for InterProScan XML output formats.""" +# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats + +import re +from xml.etree import ElementTree + +from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment + + +# element - hit attribute name mapping +_ELEM_HIT = { + "name": ("accession", str), + "ac": ("id", str), + "desc": ("description", str), +} +# element - hsp attribute name mapping +_ELEM_HSP = {"score": ("bitscore", float), "evalue": ("evalue", float)} +# element - fragment attribute name mapping +_ELEM_FRAG = { + "start": ("query_start", int), + "end": ("query_end", int), + "hmm-start": ("hit_start", int), + "hmm-end": ("hit_end", int), +} + + +class InterproscanXmlParser: + """Parser for the InterProScan XML format.""" + + def __init__(self, handle): + """Initialize the class.""" + self.xml_iter = iter(ElementTree.iterparse(handle, events=("start", "end"))) + self._meta = self._parse_header() + + def __iter__(self): + """Iterate qresults.""" + yield from self._parse_qresult() + + def _parse_header(self): + """Parse the header for the InterProScan version (PRIVATE).""" + event, elem = next(self.xml_iter) + meta = {} + meta["target"] = "InterPro" + meta["program"] = "InterProScan" + meta["version"] = elem.attrib["interproscan-version"] + # store the namespace value + self.NS = re.sub("protein-matches", "", elem.tag) + return meta + + def _parse_qresult(self): + """Parse query results (PRIVATE).""" + for event, elem in self.xml_iter: + if event == "end" and elem.tag == self.NS + "protein": + # store the query sequence + seq = elem.find(self.NS + "sequence") + query_seq = seq.text + + # store the query id and description + xref = elem.find(self.NS + "xref") + query_id = xref.attrib["id"] + query_desc = xref.attrib["name"] + + # parse each hit + hit_list = [] + for hit_new in self._parse_hit( + elem.find(self.NS + "matches"), query_id, query_seq + ): + # interproscan results contain duplicate hits rather than + # a single hit with 
multiple hsps. In this case the hsps + # of a duplicate hit will be appended to the already + # existing hit + for hit in hit_list: + if hit.id == hit_new.id: + for hsp in hit_new.hsps: + hit.hsps.append(hsp) + break + else: + hit_list.append(hit_new) + + # create qresult and assign attributes + qresult = QueryResult(hit_list, query_id) + setattr(qresult, "description", query_desc) + for key, value in self._meta.items(): + setattr(qresult, key, value) + yield qresult + + def _parse_hit(self, root_hit_elem, query_id, query_seq=None): + """Parse hit (PRIVATE).""" + # feed the loop below an empty list so iteration still works + if root_hit_elem is None: + root_hit_elem = [] + + for hit_elem in root_hit_elem: + # store the match/location type + hit_type = re.sub(r"%s(\w+)-match" % self.NS, r"\1", hit_elem.find(".").tag) + # store the hit id + signature = hit_elem.find(self.NS + "signature") + hit_id = signature.attrib["ac"] + + # store xrefs and alt_descs + xrefs = self._parse_xrefs(signature.find(self.NS + "entry")) + + # parse each hsp + hsps = list( + self._parse_hsp( + hit_elem.find(self.NS + "locations"), query_id, hit_id, query_seq + ) + ) + + # create hit and assign attributes + hit = Hit(hsps, hit_id) + setattr(hit, "dbxrefs", xrefs) + for key, (attr, caster) in _ELEM_HIT.items(): + value = signature.attrib.get(key) + if value is not None: + setattr(hit, attr, caster(value)) + # format specific attributes + hit.attributes["Hit type"] = hit_type + signature_lib = signature.find(self.NS + "signature-library-release") + hit.attributes["Target"] = str(signature_lib.attrib.get("library")) + hit.attributes["Target version"] = str(signature_lib.attrib.get("version")) + + yield hit + + def _parse_hsp(self, root_hsp_elem, query_id, hit_id, query_seq=None): + """Parse hsp (PRIVATE).""" + # feed the loop below an empty list so iteration still works + if root_hsp_elem is None: + root_hsp_elem = [] + + for hsp_elem in root_hsp_elem: + # create frag and assign attributes + frag = HSPFragment(hit_id, query_id) + setattr(frag, "molecule_type", "protein") + if query_seq is not None: + setattr(frag, "query", query_seq) + for key, (attr, caster) in _ELEM_FRAG.items(): + value = hsp_elem.attrib.get(key) + if value is not None: + # start should be 0-based + if attr.endswith("start"): + value = caster(value) - 1 + # store query start and end to calculate aln_span + if attr == "query_start": + start = int(value) + if attr == "query_end": + end = int(value) + setattr(frag, attr, caster(value)) + # calculate aln_span and store + setattr(frag, "aln_span", end - start) + + # create hsp and assign attributes + hsp = HSP([frag]) + setattr(hsp, "query_id", query_id) + setattr(hsp, "hit_id", hit_id) + for key, (attr, caster) in _ELEM_HSP.items(): + value = hsp_elem.attrib.get(key) + if value is not None: + setattr(hsp, attr, caster(value)) + yield hsp + + def _parse_xrefs(self, root_entry_elem): + """Parse xrefs (PRIVATE).""" + xrefs = [] + # store entry id and description + if root_entry_elem is not None: + xrefs.append("IPR:" + root_entry_elem.attrib["ac"]) + + # store go-xrefs and pathway-refs id and description + if root_entry_elem is not None: + xref_elems = [] + xref_elems = xref_elems + root_entry_elem.findall(self.NS + "go-xref") + xref_elems = xref_elems + root_entry_elem.findall(self.NS + "pathway-xref") + + for entry in xref_elems: + xref = entry.attrib["id"] + if ":" not in xref: + xref = entry.attrib["db"] + ":" + xref + xrefs.append(xref) + return xrefs + + +# if not used as a module, run the doctest 
+if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/__init__.py b/code/lib/Bio/SearchIO/__init__.py new file mode 100644 index 0000000..1b0084a --- /dev/null +++ b/code/lib/Bio/SearchIO/__init__.py @@ -0,0 +1,684 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Biopython interface for sequence search program outputs. + +The SearchIO submodule provides parsers, indexers, and writers for outputs from +various sequence search programs. It provides an API similar to SeqIO and +AlignIO, with the following main functions: ``parse``, ``read``, ``to_dict``, +``index``, ``index_db``, ``write``, and ``convert``. + +SearchIO parses a search output file's contents into a hierarchy of four nested +objects: QueryResult, Hit, HSP, and HSPFragment. Each of them models a part of +the search output file: + + - QueryResult represents a search query. This is the main object returned + by the input functions and it contains all other objects. + - Hit represents a database hit, + - HSP represents high-scoring alignment region(s) in the hit, + - HSPFragment represents a contiguous alignment within the HSP + +In addition to the four objects above, SearchIO is also tightly integrated with +the SeqRecord objects (see SeqIO) and MultipleSeqAlignment objects (see +AlignIO). SeqRecord objects are used to store the actual matching hit and query +sequences, while MultipleSeqAlignment objects stores the alignment between them. + +A detailed description of these objects' features and their example usages are +available in their respective documentations. + + +Input +===== +The main function for parsing search output files is Bio.SearchIO.parse(...). +This function parses a given search output file and returns a generator object +that yields one QueryResult object per iteration. + +``parse`` takes two arguments: 1) a file handle or a filename of the input file +(the search output file) and 2) the format name. + + >>> from Bio import SearchIO + >>> for qresult in SearchIO.parse('Blast/mirna.xml', 'blast-xml'): + ... print("%s %s" % (qresult.id, qresult.description)) + ... + 33211 mir_1 + 33212 mir_2 + 33213 mir_3 + +SearchIO also provides the Bio.SearchIO.read(...) function, which is intended +for use on search output files containing only one query. ``read`` returns one +QueryResult object and will raise an exception if the source file contains more +than one queries: + + >>> qresult = SearchIO.read('Blast/xml_2226_blastp_004.xml', 'blast-xml') + >>> print("%s %s" % (qresult.id, qresult.description)) + ... + gi|11464971:4-101 pleckstrin [Mus musculus] + + >>> SearchIO.read('Blast/mirna.xml', 'blast-xml') + Traceback (most recent call last): + ... + ValueError: ... + +For accessing search results of large output files, you may use the indexing +functions Bio.SearchIO.index(...) or Bio.SearchIO.index_db(...). They have a +similar interface to their counterparts in SeqIO and AlignIO, with the addition +of optional, format-specific keyword arguments. + + +Output +====== +SearchIO has writing support for several formats, accessible from the +Bio.SearchIO.write(...) function. 
This function returns a tuple of four
+numbers: the number of QueryResult, Hit, HSP, and HSPFragment objects
+written::
+
+    qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
+    SearchIO.write(qresults, 'results.tab', 'blast-tab')
+    (3, 239, 277, 277)
+
+Note that different writers may require different attribute values of the
+SearchIO objects. This limits the scope of writable search results to search
+results possessing the required attributes.
+
+For example, the writer for HMMER domain table output requires
+the conditional e-value attribute from each HSP object, among others. If you
+try to write to the HMMER domain table format and your HSPs do not have this
+attribute, an exception will be raised.
+
+
+Conversion
+==========
+SearchIO provides a shortcut function Bio.SearchIO.convert(...) to convert a
+given file into another format. Under the hood, ``convert`` simply parses a given
+output file and writes it to another using the ``parse`` and ``write`` functions.
+
+Note that the same restrictions found in Bio.SearchIO.write(...) apply to the
+convert function as well.
+
+
+Conventions
+===========
+The main goal of creating SearchIO is to have a common, easy-to-use interface
+across different search output files. As such, we have also created some
+conventions / standards for SearchIO that extend beyond the common object model.
+These conventions apply to all files parsed by SearchIO, regardless of their
+individual formats.
+
+Python-style sequence coordinates
+---------------------------------
+
+When storing sequence coordinates (start and end values), SearchIO uses
+the Python-style slice convention: zero-based and half-open intervals. For
+example, if in a BLAST XML output file the start and end coordinates of an
+HSP are 10 and 28, they would become 9 and 28 in SearchIO. The start
+coordinate becomes 9 because Python indices start from zero, while the end
+coordinate remains 28 as Python slices omit the last item in an interval.
+
+Besides giving you the benefits of standardization, this convention also
+makes the coordinates usable for slicing sequences. For example, given a
+full query sequence and the start and end coordinates of an HSP, one can
+use the coordinates to extract part of the query sequence that results in
+the database hit.
+
+When these objects are written to an output file using
+SearchIO.write(...), the coordinate values are restored to their
+respective format's convention. Using the example above, if the HSP would
+be written to an XML file, the start and end coordinates would become 10
+and 28 again.
+
+Sequence coordinate order
+-------------------------
+
+Some search output formats reverse the start and end coordinates
+according to the sequence's strand. For example, in BLAST plain text
+format, if the matching strand lies in the minus orientation, then the
+start coordinate will always be bigger than the end coordinate.
+
+In SearchIO, start coordinates are always smaller than the end
+coordinates, regardless of their originating strand. This ensures
+consistency when using the coordinates to slice full sequences.
+
+Note that this coordinate order convention is only enforced at the
+HSPFragment level. If an HSP object has several HSPFragment objects, each
+individual fragment will conform to this convention. But the order of the
+fragments within the HSP object follows what the search output file uses.
+
+Similar to the coordinate style convention, the start and end coordinates'
+order is restored to their respective formats when the objects are
+written using Bio.SearchIO.write(...).
+
+Frames and strand values
+------------------------
+
+SearchIO only allows -1, 0, 1 and None as strand values. For frames, the
+only allowed values are integers from -3 to 3 (inclusive) and None. Both
+of these are standard Biopython conventions.
+
+
+Supported Formats
+=================
+Below is a list of search program output formats supported by SearchIO.
+
+Support for parsing, indexing, and writing:
+
+ - blast-tab      - BLAST+ tabular output. Both variants without comments
+                    (-m 6 flag) and with comments (-m 7 flag) are supported.
+ - blast-xml      - BLAST+ XML output.
+ - blat-psl       - The default output of BLAT (PSL format). Variants with or
+                    without header are both supported. PSLX (PSL + sequences)
+                    is also supported.
+ - hmmer3-tab     - HMMER3 table output.
+ - hmmer3-domtab  - HMMER3 domain table output. When using this format, the
+                    program name has to be specified. For example, for parsing
+                    hmmscan output, the name would be 'hmmscan3-domtab'.
+
+Support for parsing and indexing:
+
+ - exonerate-text   - Exonerate plain text output.
+ - exonerate-vulgar - Exonerate vulgar line.
+ - exonerate-cigar  - Exonerate cigar line.
+ - fasta-m10        - Bill Pearson's FASTA -m 10 output.
+ - hmmer3-text      - HMMER3 regular text output format. Supported HMMER3
+                      subprograms are hmmscan, hmmsearch, and phmmer.
+ - hmmer2-text      - HMMER2 regular text output format. Supported HMMER2
+                      subprograms are hmmpfam, hmmsearch.
+
+Support for parsing:
+
+ - blast-text    - BLAST+ plain text output.
+ - hhsuite2-text - HHSUITE plain text output.
+
+Each of these formats has different keyword arguments available for use with
+the main SearchIO functions. More details and examples are available in each
+format's documentation.
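+
+As a rough illustration of the coordinate conventions above (a sketch only,
+not one of the official examples; the file name and the full-length query
+sequence are hypothetical)::
+
+    from Bio import SearchIO
+
+    qresult = SearchIO.read('my_blast.xml', 'blast-xml')
+    hsp = qresult[0][0]  # first HSP of the first hit
+    # query_start/query_end are zero-based and half-open, with start < end,
+    # so they can be used directly to slice the full query sequence:
+    matched = full_query_seq[hsp.query_start:hsp.query_end]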
+
+"""
+
+from Bio.File import as_handle
+from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
+from Bio.SearchIO._utils import get_processor
+
+
+__all__ = ("read", "parse", "to_dict", "index", "index_db", "write", "convert")
+
+
+# dictionary of supported formats for parse() and read()
+_ITERATOR_MAP = {
+    "blast-tab": ("BlastIO", "BlastTabParser"),
+    "blast-text": ("BlastIO", "BlastTextParser"),
+    "blast-xml": ("BlastIO", "BlastXmlParser"),
+    "blat-psl": ("BlatIO", "BlatPslParser"),
+    "exonerate-cigar": ("ExonerateIO", "ExonerateCigarParser"),
+    "exonerate-text": ("ExonerateIO", "ExonerateTextParser"),
+    "exonerate-vulgar": ("ExonerateIO", "ExonerateVulgarParser"),
+    "fasta-m10": ("FastaIO", "FastaM10Parser"),
+    "hhsuite2-text": ("HHsuiteIO", "Hhsuite2TextParser"),
+    "hhsuite3-text": ("HHsuiteIO", "Hhsuite2TextParser"),
+    "hmmer2-text": ("HmmerIO", "Hmmer2TextParser"),
+    "hmmer3-text": ("HmmerIO", "Hmmer3TextParser"),
+    "hmmer3-tab": ("HmmerIO", "Hmmer3TabParser"),
+    # for hmmer3-domtab, the specific program is part of the format name
+    # as we need it to distinguish hit / target coordinates
+    "hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitParser"),
+    "hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryParser"),
+    "interproscan-xml": ("InterproscanIO", "InterproscanXmlParser"),
+    "phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryParser"),
+}
+
+# dictionary of supported formats for index()
+_INDEXER_MAP = {
+    "blast-tab": ("BlastIO", "BlastTabIndexer"),
+    "blast-xml": ("BlastIO", "BlastXmlIndexer"),
+    "blat-psl": ("BlatIO", "BlatPslIndexer"),
+    "exonerate-cigar": ("ExonerateIO", "ExonerateCigarIndexer"),
+    "exonerate-text": ("ExonerateIO", "ExonerateTextIndexer"),
+    "exonerate-vulgar": ("ExonerateIO", "ExonerateVulgarIndexer"),
+    "fasta-m10": ("FastaIO", "FastaM10Indexer"),
+    "hmmer2-text": ("HmmerIO", "Hmmer2TextIndexer"),
+    "hmmer3-text": ("HmmerIO", "Hmmer3TextIndexer"),
+    "hmmer3-tab": ("HmmerIO", "Hmmer3TabIndexer"),
+    "hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitIndexer"),
+    "hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
+    "phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryIndexer"),
+}
+
+# dictionary of supported formats for write()
+_WRITER_MAP = {
+    "blast-tab": ("BlastIO", "BlastTabWriter"),
+    "blast-xml": ("BlastIO", "BlastXmlWriter"),
+    "blat-psl": ("BlatIO", "BlatPslWriter"),
+    "hmmer3-tab": ("HmmerIO", "Hmmer3TabWriter"),
+    "hmmscan3-domtab": ("HmmerIO", "Hmmer3DomtabHmmhitWriter"),
+    "hmmsearch3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryWriter"),
+    "phmmer3-domtab": ("HmmerIO", "Hmmer3DomtabHmmqueryWriter"),
+}
+
+
+def parse(handle, format=None, **kwargs):
+    """Iterate over search tool output file as QueryResult objects.
+
+    Arguments:
+     - handle - Handle to the file, or the filename as a string.
+     - format - Lower case string denoting one of the supported formats.
+     - kwargs - Format-specific keyword arguments.
+
+    This function is used to iterate over each query in a given search output
+    file:
+
+    >>> from Bio import SearchIO
+    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
+    >>> qresults
+    <generator object ...>
+    >>> for qresult in qresults:
+    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
+    ...
+    Search 33211 has 100 hits
+    Search 33212 has 44 hits
+    Search 33213 has 95 hits
+
+    Depending on the file format, ``parse`` may also accept additional keyword
+    argument(s) that modify the behavior of the format parser.
Here is a + simple example, where the keyword argument enables parsing of a commented + BLAST tabular output file: + + >>> from Bio import SearchIO + >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True): + ... print("Search %s has %i hits" % (qresult.id, len(qresult))) + ... + Search 33211 has 100 hits + Search 33212 has 44 hits + Search 33213 has 95 hits + + """ + # get the iterator object and do error checking + iterator = get_processor(format, _ITERATOR_MAP) + + # HACK: force BLAST XML decoding to use utf-8 + handle_kwargs = {} + if format == "blast-xml": + handle_kwargs["encoding"] = "utf-8" + + # and start iterating + with as_handle(handle, **handle_kwargs) as source_file: + generator = iterator(source_file, **kwargs) + yield from generator + + +def read(handle, format=None, **kwargs): + """Turn a search output file containing one query into a single QueryResult. + + - handle - Handle to the file, or the filename as a string. + - format - Lower case string denoting one of the supported formats. + - kwargs - Format-specific keyword arguments. + + ``read`` is used for parsing search output files containing exactly one query: + + >>> from Bio import SearchIO + >>> qresult = SearchIO.read('Blast/xml_2226_blastp_004.xml', 'blast-xml') + >>> print("%s %s" % (qresult.id, qresult.description)) + ... + gi|11464971:4-101 pleckstrin [Mus musculus] + + If the given handle has no results, an exception will be raised: + + >>> from Bio import SearchIO + >>> qresult = SearchIO.read('Blast/tab_2226_tblastn_002.txt', 'blast-tab') + Traceback (most recent call last): + ... + ValueError: No query results found in handle + + Similarly, if the given handle has more than one results, an exception will + also be raised: + + >>> from Bio import SearchIO + >>> qresult = SearchIO.read('Blast/tab_2226_tblastn_001.txt', 'blast-tab') + Traceback (most recent call last): + ... + ValueError: More than one query results found in handle + + Like ``parse``, ``read`` may also accept keyword argument(s) depending on the + search output file format. + + """ + query_results = parse(handle, format, **kwargs) + + try: + query_result = next(query_results) + except StopIteration: + raise ValueError("No query results found in handle") from None + try: + next(query_results) + raise ValueError("More than one query results found in handle") + except StopIteration: + pass + + return query_result + + +def to_dict(qresults, key_function=None): + """Turn a QueryResult iterator or list into a dictionary. + + - qresults - Iterable returning QueryResult objects. + - key_function - Optional callback function which when given a + QueryResult object should return a unique key for the + dictionary. Defaults to using .id of the result. + + This function enables access of QueryResult objects from a single search + output file using its identifier. + + >>> from Bio import SearchIO + >>> qresults = SearchIO.parse('Blast/wnts.xml', 'blast-xml') + >>> search_dict = SearchIO.to_dict(qresults) + >>> list(search_dict) + ['gi|195230749:301-1383', 'gi|325053704:108-1166', ..., 'gi|53729353:216-1313'] + >>> search_dict['gi|156630997:105-1160'] + QueryResult(id='gi|156630997:105-1160', 5 hits) + + By default, the dictionary key is the QueryResult's string ID. This may be + changed by supplying a callback function that returns the desired identifier. + Here is an example using a function that removes the 'gi|' part in the + beginning of the QueryResult ID. 
+ + >>> from Bio import SearchIO + >>> qresults = SearchIO.parse('Blast/wnts.xml', 'blast-xml') + >>> key_func = lambda qresult: qresult.id.split('|')[1] + >>> search_dict = SearchIO.to_dict(qresults, key_func) + >>> list(search_dict) + ['195230749:301-1383', '325053704:108-1166', ..., '53729353:216-1313'] + >>> search_dict['156630997:105-1160'] + QueryResult(id='gi|156630997:105-1160', 5 hits) + + Note that the callback function does not change the QueryResult's ID value. + It only changes the key value used to retrieve the associated QueryResult. + + As this function loads all QueryResult objects into memory, it may be + unsuitable for dealing with files containing many queries. In that case, it + is recommended that you use either ``index`` or ``index_db``. + + Since Python 3.7, the default dict class maintains key order, meaning + this dictionary will reflect the order of records given to it. For + CPython and PyPy, this was already implemented for Python 3.6, so + effectively you can always assume the record order is preserved. + """ + + def _default_key_function(rec): + return rec.id + + if key_function is None: + key_function = _default_key_function + + qdict = {} + for qresult in qresults: + key = key_function(qresult) + if key in qdict: + raise ValueError("Duplicate key %r" % key) + qdict[key] = qresult + return qdict + + +def index(filename, format=None, key_function=None, **kwargs): + """Indexes a search output file and returns a dictionary-like object. + + - filename - string giving name of file to be indexed + - format - Lower case string denoting one of the supported formats. + - key_function - Optional callback function which when given a + QueryResult should return a unique key for the dictionary. + - kwargs - Format-specific keyword arguments. + + Index returns a pseudo-dictionary object with QueryResult objects as its + values and a string identifier as its keys. The function is mainly useful + for dealing with large search output files, as it enables access to any + given QueryResult object much faster than using parse or read. + + Index works by storing in-memory the start locations of all queries in a + file. When a user requested access to the query, this function will jump + to its start position, parse the whole query, and return it as a + QueryResult object: + + >>> from Bio import SearchIO + >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml') + >>> search_idx + SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=None) + >>> sorted(search_idx) + ['gi|156630997:105-1160', 'gi|195230749:301-1383', ..., 'gi|53729353:216-1313'] + >>> search_idx['gi|195230749:301-1383'] + QueryResult(id='gi|195230749:301-1383', 5 hits) + >>> search_idx.close() + + If the file is BGZF compressed, this is detected automatically. Ordinary + GZIP files are not supported: + + >>> from Bio import SearchIO + >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml') + >>> search_idx + SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None) + >>> search_idx['gi|195230749:301-1383'] + QueryResult(id='gi|195230749:301-1383', 5 hits) + >>> search_idx.close() + + You can supply a custom callback function to alter the default identifier + string. This function should accept as its input the QueryResult ID string + and return a modified version of it. 
+
+    >>> from Bio import SearchIO
+    >>> key_func = lambda id: id.split('|')[1]
+    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml', key_func)
+    >>> search_idx
+    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=<function <lambda> at ...>)
+    >>> sorted(search_idx)
+    ['156630997:105-1160', ..., '371502086:108-1205', '53729353:216-1313']
+    >>> search_idx['156630997:105-1160']
+    QueryResult(id='gi|156630997:105-1160', 5 hits)
+    >>> search_idx.close()
+
+    Note that the callback function does not change the QueryResult's ID value.
+    It only changes the key value used to retrieve the associated QueryResult.
+
+    """
+    if not isinstance(filename, str):
+        raise TypeError("Need a filename (not a handle)")
+
+    from Bio.File import _IndexedSeqFileDict
+
+    proxy_class = get_processor(format, _INDEXER_MAP)
+    repr = "SearchIO.index(%r, %r, key_function=%r)" % (filename, format, key_function)
+    return _IndexedSeqFileDict(
+        proxy_class(filename, **kwargs), key_function, repr, "QueryResult"
+    )
+
+
+def index_db(index_filename, filenames=None, format=None, key_function=None, **kwargs):
+    """Indexes several search output files into an SQLite database.
+
+     - index_filename - The SQLite filename.
+     - filenames - List of strings specifying file(s) to be indexed, or when
+                   indexing a single file this can be given as a string.
+                   (optional if reloading an existing index, but must match)
+     - format - Lower case string denoting one of the supported formats.
+                (optional if reloading an existing index, but must match)
+     - key_function - Optional callback function which when given a
+                      QueryResult identifier string should return a unique
+                      key for the dictionary.
+     - kwargs - Format-specific keyword arguments.
+
+    The ``index_db`` function is similar to ``index`` in that it indexes the start
+    position of all queries from search output files. The main difference is
+    instead of storing these indices in-memory, they are written to disk as an
+    SQLite database file. This allows the indices to persist between Python
+    sessions. This enables access to any queries in the file without any
+    indexing overhead, provided it has been indexed at least once.
+
+    >>> from Bio import SearchIO
+    >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only!
+    >>> db_idx = SearchIO.index_db(idx_filename, 'Blast/mirna.xml', 'blast-xml')
+    >>> sorted(db_idx)
+    ['33211', '33212', '33213']
+    >>> db_idx['33212']
+    QueryResult(id='33212', 44 hits)
+    >>> db_idx.close()
+
+    ``index_db`` can also index multiple files and store them in the same
+    database, making it easier to group multiple search files and access them
+    from a single interface.
+
+    >>> from Bio import SearchIO
+    >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only!
+    >>> files = ['Blast/mirna.xml', 'Blast/wnts.xml']
+    >>> db_idx = SearchIO.index_db(idx_filename, files, 'blast-xml')
+    >>> sorted(db_idx)
+    ['33211', '33212', '33213', 'gi|156630997:105-1160', ..., 'gi|53729353:216-1313']
+    >>> db_idx['33212']
+    QueryResult(id='33212', 44 hits)
+    >>> db_idx.close()
+
+    One common example where this is helpful is if you had a large set of
+    query sequences (say ten thousand) which you split into ten query files
+    of one thousand sequences each in order to run as ten separate BLAST jobs
+    on a cluster. You could use ``index_db`` to index the ten BLAST output
+    files together for seamless access to all the results as one dictionary.
+
+    Note that ':memory:' rather than an index filename tells SQLite to hold
+    the index database in memory.
This is useful for quick tests, but using + the Bio.SearchIO.index(...) function instead would use less memory. + + BGZF compressed files are supported, and detected automatically. Ordinary + GZIP compressed files are not supported. + + See also Bio.SearchIO.index(), Bio.SearchIO.to_dict(), and the Python module + glob which is useful for building lists of files. + """ + # cast filenames to list if it's a string + # (can we check if it's a string or a generator?) + if isinstance(filenames, str): + filenames = [filenames] + + from Bio.File import _SQLiteManySeqFilesDict + + repr = "SearchIO.index_db(%r, filenames=%r, format=%r, key_function=%r, ...)" % ( + index_filename, + filenames, + format, + key_function, + ) + + def proxy_factory(format, filename=None): + """Given a filename returns proxy object, else boolean if format OK.""" + if filename: + return get_processor(format, _INDEXER_MAP)(filename, **kwargs) + else: + return format in _INDEXER_MAP + + return _SQLiteManySeqFilesDict( + index_filename, filenames, proxy_factory, format, key_function, repr + ) + + +def write(qresults, handle, format=None, **kwargs): + """Write QueryResult objects to a file in the given format. + + - qresults - An iterator returning QueryResult objects or a single + QueryResult object. + - handle - Handle to the file, or the filename as a string. + - format - Lower case string denoting one of the supported formats. + - kwargs - Format-specific keyword arguments. + + The ``write`` function writes QueryResult object(s) into the given output + handle / filename. You can supply it with a single QueryResult object or an + iterable returning one or more QueryResult objects. In both cases, the + function will return a tuple of four values: the number of QueryResult, Hit, + HSP, and HSPFragment objects it writes to the output file:: + + from Bio import SearchIO + qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml') + SearchIO.write(qresults, 'results.tab', 'blast-tab') + (3, 239, 277, 277) + + The output of different formats may be adjusted using the format-specific + keyword arguments. Here is an example that writes BLAT PSL output file with + a header:: + + from Bio import SearchIO + qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl') + SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True) + (2, 13, 22, 26) + + """ + # turn qresults into an iterator if it's a single QueryResult object + if isinstance(qresults, QueryResult): + qresults = iter([qresults]) + else: + qresults = iter(qresults) + + # get the writer object and do error checking + writer_class = get_processor(format, _WRITER_MAP) + + # write to the handle + with as_handle(handle, "w") as target_file: + writer = writer_class(target_file, **kwargs) + # count how many qresults, hits, and hsps + qresult_count, hit_count, hsp_count, frag_count = writer.write_file(qresults) + + return qresult_count, hit_count, hsp_count, frag_count + + +def convert(in_file, in_format, out_file, out_format, in_kwargs=None, out_kwargs=None): + """Convert between two search output formats, return number of records. + + - in_file - Handle to the input file, or the filename as string. + - in_format - Lower case string denoting the format of the input file. + - out_file - Handle to the output file, or the filename as string. + - out_format - Lower case string denoting the format of the output file. + - in_kwargs - Dictionary of keyword arguments for the input function. + - out_kwargs - Dictionary of keyword arguments for the output function. 
+ + The convert function is a shortcut function for ``parse`` and ``write``. It has + the same return type as ``write``. Format-specific arguments may be passed to + the convert function, but only as dictionaries. + + Here is an example of using ``convert`` to convert from a BLAST+ XML file + into a tabular file with comments:: + + from Bio import SearchIO + in_file = 'Blast/mirna.xml' + in_fmt = 'blast-xml' + out_file = 'results.tab' + out_fmt = 'blast-tab' + out_kwarg = {'comments': True} + SearchIO.convert(in_file, in_fmt, out_file, out_fmt, out_kwargs=out_kwarg) + (3, 239, 277, 277) + + Given that different search output file provide different statistics and + different level of details, the convert function is limited only to + converting formats that have the same statistics and for conversion to + formats with the same level of detail, or less. + + For example, converting from a BLAST+ XML output to a HMMER table file + is not possible, as these are two search programs with different kinds of + statistics. In theory, you may provide the necessary values required by the + HMMER table file (e.g. conditional e-values, envelope coordinates, etc). + However, these values are likely to hold little meaning as they are not true + HMMER-computed values. + + Another example is converting from BLAST+ XML to BLAST+ tabular file. This + is possible, as BLAST+ XML provide all the values necessary to create a + BLAST+ tabular file. However, the reverse conversion may not be possible. + There are more details covered in the XML file that are not found in a + tabular file (e.g. the lambda and kappa values) + + """ + if in_kwargs is None: + in_kwargs = {} + if out_kwargs is None: + out_kwargs = {} + + qresults = parse(in_file, in_format, **in_kwargs) + return write(qresults, out_file, out_format, **out_kwargs) + + +# if not used as a module, run the doctest +if __name__ == "__main__": + from Bio._utils import run_doctest + + run_doctest() diff --git a/code/lib/Bio/SearchIO/__pycache__/BlatIO.cpython-37.pyc b/code/lib/Bio/SearchIO/__pycache__/BlatIO.cpython-37.pyc new file mode 100644 index 0000000..bfed650 Binary files /dev/null and b/code/lib/Bio/SearchIO/__pycache__/BlatIO.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/__pycache__/FastaIO.cpython-37.pyc b/code/lib/Bio/SearchIO/__pycache__/FastaIO.cpython-37.pyc new file mode 100644 index 0000000..fdb3870 Binary files /dev/null and b/code/lib/Bio/SearchIO/__pycache__/FastaIO.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..1f04b72 Binary files /dev/null and b/code/lib/Bio/SearchIO/__pycache__/__init__.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/__pycache__/_index.cpython-37.pyc b/code/lib/Bio/SearchIO/__pycache__/_index.cpython-37.pyc new file mode 100644 index 0000000..9d3448a Binary files /dev/null and b/code/lib/Bio/SearchIO/__pycache__/_index.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/__pycache__/_utils.cpython-37.pyc b/code/lib/Bio/SearchIO/__pycache__/_utils.cpython-37.pyc new file mode 100644 index 0000000..20cf9af Binary files /dev/null and b/code/lib/Bio/SearchIO/__pycache__/_utils.cpython-37.pyc differ diff --git a/code/lib/Bio/SearchIO/_index.py b/code/lib/Bio/SearchIO/_index.py new file mode 100644 index 0000000..0fa5b04 --- /dev/null +++ b/code/lib/Bio/SearchIO/_index.py @@ -0,0 +1,34 @@ +# Copyright 2012 by Wibowo Arindrarto. All rights reserved. 
+# Revisions copyright 2012-2016 by Peter Cock. All rights reserved. +# +# This file is part of the Biopython distribution and governed by your +# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". +# Please see the LICENSE file that should have been included as part of this +# package. + +"""Custom indexing for Bio.SearchIO objects.""" + +from io import StringIO + +from Bio.File import _IndexedSeqFileProxy, _open_for_random_access + + +class SearchIndexer(_IndexedSeqFileProxy): + """Base class for file format specific random access. + + Subclasses for each file format should define '_parser' and optionally + 'get_raw' methods. + """ + + def __init__(self, filename, **kwargs): + """Initialize the class.""" + self._handle = _open_for_random_access(filename) + self._kwargs = kwargs + + def _parse(self, handle): + """Pass handle and arguments to the next iterable (PRIVATE).""" + return next(iter(self._parser(handle, **self._kwargs))) + + def get(self, offset): + """Get offset and convert it from bytes to string.""" + return self._parse(StringIO(self.get_raw(offset).decode())) diff --git a/code/lib/Bio/SearchIO/_legacy/NCBIStandalone.py b/code/lib/Bio/SearchIO/_legacy/NCBIStandalone.py new file mode 100644 index 0000000..378da39 --- /dev/null +++ b/code/lib/Bio/SearchIO/_legacy/NCBIStandalone.py @@ -0,0 +1,1953 @@ +# Copyright 1999-2000 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# Patches by Mike Poidinger to support multiple databases. +# Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 + +"""Code for calling standalone BLAST and parsing plain text output (DEPRECATED). + +Rather than parsing the human readable plain text BLAST output (which seems to +change with every update to BLAST), we and the NBCI recommend you parse the +XML output instead. The plain text parser in this module still works at the +time of writing, but is considered obsolete and updating it to cope with the +latest versions of BLAST is not a priority for us. + +This module also provides code to work with the "legacy" standalone version of +NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of +the same name. These functions are very limited for dealing with the output as +files rather than handles, for which the wrappers in Bio.Blast.Applications are +preferred. Furthermore, the NCBI themselves regard these command line tools as +"legacy", and encourage using the new BLAST+ tools instead. Biopython has +wrappers for these under Bio.Blast.Applications (see the tutorial). +""" + +import re + +from io import StringIO +from Bio.SearchIO._legacy.ParserSupport import ( + UndoHandle, + AbstractParser, + AbstractConsumer, + read_and_call, + read_and_call_until, + read_and_call_while, + attempt_read_and_call, + is_blank_line, + safe_peekline, + safe_readline, +) +from Bio.Blast import Record + +from Bio import BiopythonWarning +import warnings + +_score_e_re = re.compile(r"Score +E") + + +class LowQualityBlastError(Exception): + """Error caused by running a low quality sequence through BLAST. + + When low quality sequences (like GenBank entries containing only + stretches of a single nucleotide) are BLASTed, they will result in + BLAST generating an error and not being able to perform the BLAST. + search. This error should be raised for the BLAST reports produced + in this case. 
+ """ + + pass + + +class ShortQueryBlastError(Exception): + """Error caused by running a short query sequence through BLAST. + + If the query sequence is too short, BLAST outputs warnings and errors:: + + Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. + [blastall] ERROR: [000.000] AT1G08320: Blast: + [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize + done + + This exception is raised when that condition is detected. + """ + + pass + + +class _Scanner: + """Scan BLAST output from blastall or blastpgp. + + Tested with blastall and blastpgp v2.0.10, v2.0.11 + + Methods: + - feed Feed data into the scanner. + + """ + + def __init__(self): + """Raise warning that this module is outdated.""" + warnings.warn( + "Parsing BLAST plain text output file is not a well supported" + " functionality anymore. Consider generating your BLAST output for parsing" + " as XML or tabular format instead.", + BiopythonWarning, + ) + + def feed(self, handle, consumer): + """Feed in a BLAST report for scanning. + + Arguments: + - handle is a file-like object that contains the BLAST report. + - consumer is a Consumer object that will receive events as the + report is scanned. + """ + if isinstance(handle, UndoHandle): + uhandle = handle + else: + uhandle = UndoHandle(handle) + + # Try to fast-forward to the beginning of the blast report. + read_and_call_until(uhandle, consumer.noevent, contains="BLAST") + # Now scan the BLAST report. + self._scan_header(uhandle, consumer) + self._scan_rounds(uhandle, consumer) + self._scan_database_report(uhandle, consumer) + self._scan_parameters(uhandle, consumer) + + def _scan_header(self, uhandle, consumer): + # BLASTP 2.0.10 [Aug-26-1999] + # + # + # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf + # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), + # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea + # programs", Nucleic Acids Res. 25:3389-3402. + # + # Query= test + # (140 letters) + # + # Database: sdqib40-1.35.seg.fa + # 1323 sequences; 223,339 total letters + # + # ======================================================== + # This next example is from the online version of Blast, + # note there are TWO references, an RID line, and also + # the database is BEFORE the query line. + # Note there possibleuse of non-ASCII in the author names. + # ======================================================== + # + # BLASTP 2.2.15 [Oct-15-2006] + # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, + # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman + # (1997), "Gapped BLAST and PSI-BLAST: a new generation of + # protein database search programs", Nucleic Acids Res. 25:3389-3402. + # + # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei + # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and + # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST + # protein database searches with composition-based statistics + # and other refinements", Nucleic Acids Res. 29:2994-3005. 
+ # + # RID: 1166022616-19998-65316425856.BLASTQ1 + # + # + # Database: All non-redundant GenBank CDS + # translations+PDB+SwissProt+PIR+PRF excluding environmental samples + # 4,254,166 sequences; 1,462,033,012 total letters + # Query= gi:16127998 + # Length=428 + # + + consumer.start_header() + + read_and_call(uhandle, consumer.version, contains="BLAST") + read_and_call_while(uhandle, consumer.noevent, blank=1) + + # There might be a
<pre> line, for qblast output.
+        attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
+
+        # Read the reference(s)
+        while attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
+            # References are normally multiline, terminated by a blank line
+            # (or, based on the old code, the RID line)
+            while True:
+                line = uhandle.readline()
+                if is_blank_line(line):
+                    consumer.noevent(line)
+                    break
+                elif line.startswith("RID"):
+                    break
+                else:
+                    # More of the reference
+                    consumer.reference(line)
+
+        # Deal with the optional RID: ...
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+        attempt_read_and_call(uhandle, consumer.reference, start="RID:")
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        # blastpgp may have a reference for compositional score matrix
+        # adjustment (see Bug 2502):
+        if attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
+            read_and_call_until(uhandle, consumer.reference, blank=1)
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        # blastpgp has a Reference for composition-based statistics.
+        if attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
+            read_and_call_until(uhandle, consumer.reference, blank=1)
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
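+        # Having consumed the reference(s) and optional RID, the next
+        # non-blank line must introduce either the query ("Query=", old
+        # style) or the database ("Database:", new style).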
+        line = uhandle.peekline()
+        assert line.strip() != ""
+        assert not line.startswith("RID:")
+        if line.startswith("Query="):
+            # This is an old style query then database...
+
+            # Read the Query lines and the following blank line.
+            read_and_call(uhandle, consumer.query_info, start="Query=")
+            read_and_call_until(uhandle, consumer.query_info, blank=1)
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+            # Read the database lines and the following blank line.
+            read_and_call_until(uhandle, consumer.database_info, end="total letters")
+            read_and_call(uhandle, consumer.database_info, contains="sequences")
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+        elif line.startswith("Database:"):
+            # This is a new style database then query...
+            read_and_call_until(uhandle, consumer.database_info, end="total letters")
+            read_and_call(uhandle, consumer.database_info, contains="sequences")
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+            # Read the Query lines and the following blank line.
+            # Or, on BLAST 2.2.22+ there is no blank line - need to spot
+            # the "... Score     E" line instead.
+            read_and_call(uhandle, consumer.query_info, start="Query=")
+            # BLAST 2.2.25+ has a blank line before Length=
+            read_and_call_until(uhandle, consumer.query_info, start="Length=")
+            while True:
+                line = uhandle.peekline()
+                if not line.strip() or _score_e_re.search(line) is not None:
+                    break
+                # It is more of the query (and its length)
+                read_and_call(uhandle, consumer.query_info)
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+        else:
+            raise ValueError("Invalid header?")
+
+        consumer.end_header()
+
+    def _scan_rounds(self, uhandle, consumer):
+        # Scan a bunch of rounds.
+        # Each round begins with either a "Searching......" line
+        # or a 'Score     E' line followed by descriptions and alignments.
+        # The email server doesn't give the "Searching....." line.
+        # If there is no 'Searching.....' line then you'll first see a
+        # 'Results from round' line
+
+        while not self._eof(uhandle):
+            line = safe_peekline(uhandle)
+            if (
+                not line.startswith("Searching")
+                and not line.startswith("Results from round")
+                and _score_e_re.search(line) is None
+                and "No hits found" not in line
+            ):
+                break
+            self._scan_descriptions(uhandle, consumer)
+            self._scan_alignments(uhandle, consumer)
+
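+    # A minimal sketch of how this scanner is driven (illustrative only; the
+    # file name is hypothetical, and AbstractConsumer comes from the import
+    # at the top of this module)::
+    #
+    #     class EventLogger(AbstractConsumer):
+    #         def _unhandled(self, data):
+    #             print(data.rstrip())  # every scanned line lands here
+    #
+    #     with open("report.txt") as handle:
+    #         _Scanner().feed(handle, EventLogger())
+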
+    def _scan_descriptions(self, uhandle, consumer):
+        # Searching..................................................done
+        # Results from round 2
+        #
+        #
+        #                                                                    Sc
+        # Sequences producing significant alignments:                        (b
+        # Sequences used in model and found again:
+        #
+        # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ...
+        # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B...
+        # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)]
+        #
+        # Sequences not found previously or not previously below threshold:
+        #
+        # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia]
+        # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb...
+        #
+
+        # If PSI-BLAST, may also have:
+        #
+        # CONVERGED!
+
+        consumer.start_descriptions()
+
+        # Read 'Searching'
+        # This line seems to be missing in BLASTN 2.1.2 (others?)
+        attempt_read_and_call(uhandle, consumer.noevent, start="Searching")
+
+        # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here.
+        # If this happens, the handle will yield no more information.
+        if not uhandle.peekline():
+            raise ValueError(
+                "Unexpected end of blast report. Looks suspiciously like a PSI-BLAST crash."
+            )
+
+        # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here:
+        # Searching[blastall] WARNING:  [000.000]  AT1G08320: SetUpBlastSearch
+        # [blastall] ERROR:  [000.000]  AT1G08320: Blast:
+        # [blastall] ERROR:  [000.000]  AT1G08320: Blast: Query must be at leas
+        # done
+        # Reported by David Weisman.
+        # Check for these error lines and ignore them for now.  Let
+        # the BlastErrorParser deal with them.
+        line = uhandle.peekline()
+        if "ERROR:" in line or line.startswith("done"):
+            read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
+            read_and_call(uhandle, consumer.noevent, start="done")
+
+        # Check to see if this is PSI-BLAST.
+        # If it is, the 'Searching' line will be followed by:
+        # (version 2.0.10)
+        #     Searching.............................
+        #     Results from round 2
+        # or (version 2.0.11)
+        #     Searching.............................
+        #
+        #
+        #     Results from round 2
+
+        # Skip a bunch of blank lines.
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+        # Check for the results line if it's there.
+        if attempt_read_and_call(uhandle, consumer.round, start="Results"):
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        # Three things can happen here:
+        # 1.  line contains 'Score     E'
+        # 2.  line contains "No hits found"
+        # 3.  no descriptions
+        # The first one begins a bunch of descriptions.  The last two
+        # indicate that no descriptions follow, and we should go straight
+        # to the alignments.
+        if not attempt_read_and_call(
+            uhandle, consumer.description_header, has_re=_score_e_re
+        ):
+            # Either case 2 or 3.  Look for "No hits found".
+            attempt_read_and_call(uhandle, consumer.no_hits, contains="No hits found")
+            try:
+                read_and_call_while(uhandle, consumer.noevent, blank=1)
+            except ValueError as err:
+                if str(err) != "Unexpected end of stream.":
+                    raise
+
+            consumer.end_descriptions()
+            # Stop processing.
+            return
+
+        # Read the score header lines
+        read_and_call(uhandle, consumer.description_header, start="Sequences producing")
+
+        # If PSI-BLAST, read the 'Sequences used in model' line.
+        attempt_read_and_call(
+            uhandle, consumer.model_sequences, start="Sequences used in model"
+        )
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        # In BLAT, rather than a "No hits found" line, we just
+        # get no descriptions (and no alignments). This can be
+        # spotted because the next line is the database block:
+        if safe_peekline(uhandle).startswith("  Database:"):
+            consumer.end_descriptions()
+            # Stop processing.
+            return
+
+        # Read the descriptions and the following blank lines, making
+        # sure that there are descriptions.
+        if not uhandle.peekline().startswith("Sequences not found"):
+            read_and_call_until(uhandle, consumer.description, blank=1)
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        # If PSI-BLAST, read the 'Sequences not found' line followed
+        # by more descriptions.  However, I need to watch out for the
+        # case where there were no sequences not found previously, in
+        # which case there will be no more descriptions.
+        if attempt_read_and_call(
+            uhandle, consumer.nonmodel_sequences, start="Sequences not found"
+        ):
+            # Read the descriptions and the following blank lines.
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+            line = safe_peekline(uhandle)
+            # Brad -- added check for QUERY. On some PSI-BLAST outputs
+            # there will be a 'Sequences not found' line followed by no
+            # descriptions. Check for this case since the first thing you'll
+            # get is a blank line and then 'QUERY'
+            if (
+                not line.startswith("CONVERGED")
+                and line[0] != ">"
+                and not line.startswith("QUERY")
+            ):
+                read_and_call_until(uhandle, consumer.description, blank=1)
+                read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        attempt_read_and_call(uhandle, consumer.converged, start="CONVERGED")
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+
+        consumer.end_descriptions()
+
+    def _scan_alignments(self, uhandle, consumer):
+        if self._eof(uhandle):
+            return
+
+        # qblast inserts a helpful line here.
+        attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS")
+
+        # First, check to see if I'm at the database report.
+        line = safe_peekline(uhandle)
+        if not line:
+            # EOF
+            return
+        elif line.startswith("  Database") or line.startswith("Lambda"):
+            return
+        elif line[0] == ">":
+            # XXX make a better check here between pairwise and masterslave
+            self._scan_pairwise_alignments(uhandle, consumer)
+        elif line.startswith("Effective"):
+            return
+        else:
+            # XXX put in a check to make sure I'm in a masterslave alignment
+            self._scan_masterslave_alignment(uhandle, consumer)
+
+    def _scan_pairwise_alignments(self, uhandle, consumer):
+        while not self._eof(uhandle):
+            line = safe_peekline(uhandle)
+            if line[0] != ">":
+                break
+            self._scan_one_pairwise_alignment(uhandle, consumer)
+
+    def _scan_one_pairwise_alignment(self, uhandle, consumer):
+        if self._eof(uhandle):
+            return
+        consumer.start_alignment()
+
+        self._scan_alignment_header(uhandle, consumer)
+
+        # Scan a bunch of score/alignment pairs.
+        while True:
+            if self._eof(uhandle):
+                # Shouldn't have issued that _scan_alignment_header event...
+                break
+            line = safe_peekline(uhandle)
+            if not line.startswith(" Score"):
+                break
+            self._scan_hsp(uhandle, consumer)
+        consumer.end_alignment()
+
+    def _scan_alignment_header(self, uhandle, consumer):
+        # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus
+        #           stearothermophilus]
+        #           Length = 81
+        #
+        # Or, more recently with different white space:
+        #
+        # >gi|15799684|ref|NP_285696.1| threonine synthase ...
+        #  gi|15829258|ref|NP_308031.1| threonine synthase
+        #  ...
+        # Length=428
+        read_and_call(uhandle, consumer.title, start=">")
+        while True:
+            line = safe_readline(uhandle)
+            if line.lstrip().startswith(("Length =", "Length=")):
+                consumer.length(line)
+                break
+            elif is_blank_line(line):
+                # Check to make sure I haven't missed the Length line
+                raise ValueError("I missed the Length in an alignment header")
+            consumer.title(line)
+
+        # Older versions of BLAST will have a line with some spaces.
+        # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line.
+        if not attempt_read_and_call(uhandle, consumer.noevent, start="          "):
+            read_and_call(uhandle, consumer.noevent, blank=1)
+
+    def _scan_hsp(self, uhandle, consumer):
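+        # Each HSP is delivered to the consumer as a start_hsp ... end_hsp
+        # event pair, wrapping the header (score, identities, strand, frame)
+        # and alignment (query, align, sbjct) events emitted below.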
+        consumer.start_hsp()
+        self._scan_hsp_header(uhandle, consumer)
+        self._scan_hsp_alignment(uhandle, consumer)
+        consumer.end_hsp()
+
+    def _scan_hsp_header(self, uhandle, consumer):
+        #  Score = 22.7 bits (47), Expect = 2.5
+        #  Identities = 10/36 (27%), Positives = 18/36 (49%)
+        #  Strand = Plus / Plus
+        #  Frame = +3
+        #
+
+        read_and_call(uhandle, consumer.score, start=" Score")
+        read_and_call(uhandle, consumer.identities, start=" Identities")
+        # BLASTN
+        attempt_read_and_call(uhandle, consumer.strand, start=" Strand")
+        # BLASTX, TBLASTN, TBLASTX
+        attempt_read_and_call(uhandle, consumer.frame, start=" Frame")
+        read_and_call(uhandle, consumer.noevent, blank=1)
+
+    def _scan_hsp_alignment(self, uhandle, consumer):
+        # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF
+        #           GRGVS+         TC    Y  + + V GGG+ + EE   L     +   I R+
+        # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG
+        #
+        # Query: 64 AEKILIKR 71
+        #              I +K
+        # Sbjct: 70 PNIIQLKD 77
+        #
+
+        while True:
+            # Blastn adds an extra line filled with spaces before Query
+            attempt_read_and_call(uhandle, consumer.noevent, start="     ")
+            read_and_call(uhandle, consumer.query, start="Query")
+            read_and_call(uhandle, consumer.align, start="     ")
+            read_and_call(uhandle, consumer.sbjct, start="Sbjct")
+            try:
+                read_and_call_while(uhandle, consumer.noevent, blank=1)
+            except ValueError as err:
+                if str(err) != "Unexpected end of stream.":
+                    raise
+                # End of File (well, it looks like it with recent versions
+                # of BLAST for multiple queries after the Iterator class
+                # has broken up the whole file into chunks).
+                break
+            line = safe_peekline(uhandle)
+            # Alignment continues if I see a 'Query' or the spaces for Blastn.
+            if not (line.startswith("Query") or line.startswith("     ")):
+                break
+
+    def _scan_masterslave_alignment(self, uhandle, consumer):
+        consumer.start_alignment()
+        while True:
+            line = safe_readline(uhandle)
+            # Check to see whether I'm finished reading the alignment.
+            # This is indicated by 1) database section, 2) next psi-blast
+            # round, which can also be a 'Results from round' if no
+            # searching line is present
+            # patch by chapmanb
+            if line.startswith("Searching") or line.startswith("Results from round"):
+                uhandle.saveline(line)
+                break
+            elif line.startswith("  Database"):
+                uhandle.saveline(line)
+                break
+            elif is_blank_line(line):
+                consumer.noevent(line)
+            else:
+                consumer.multalign(line)
+        read_and_call_while(uhandle, consumer.noevent, blank=1)
+        consumer.end_alignment()
+
+    def _eof(self, uhandle):
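+        # Return True when the handle is exhausted: peeking either raises
+        # "Unexpected end of stream." or returns an empty string.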
+        try:
+            line = safe_peekline(uhandle)
+        except ValueError as err:
+            if str(err) != "Unexpected end of stream.":
+                raise
+            line = ""
+        return not line
+
+    def _scan_database_report(self, uhandle, consumer):
+        #   Database: sdqib40-1.35.seg.fa
+        #     Posted date:  Nov 1, 1999  4:25 PM
+        #   Number of letters in database: 223,339
+        #   Number of sequences in database:  1323
+        #
+        # Lambda     K      H
+        #    0.322    0.133    0.369
+        #
+        # Gapped
+        # Lambda     K      H
+        #    0.270   0.0470    0.230
+        #
+        ##########################################
+        # Or, more recently Blast 2.2.15 gives less blank lines
+        ##########################################
+        #   Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding
+        # environmental samples
+        #     Posted date:  Dec 12, 2006  5:51 PM
+        #   Number of letters in database: 667,088,753
+        #   Number of sequences in database:  2,094,974
+        # Lambda     K      H
+        #    0.319    0.136    0.395
+        # Gapped
+        # Lambda     K      H
+        #    0.267   0.0410    0.140
+
+        if self._eof(uhandle):
+            return
+
+        consumer.start_database_report()
+
+        # Subset of the database(s) listed below
+        #    Number of letters searched: 562,618,960
+        #    Number of sequences searched:  228,924
+        if attempt_read_and_call(uhandle, consumer.noevent, start="  Subset"):
+            read_and_call(uhandle, consumer.noevent, contains="letters")
+            read_and_call(uhandle, consumer.noevent, contains="sequences")
+            read_and_call(uhandle, consumer.noevent, start="  ")
+
+        # Sameet Mehta reported seeing output from BLASTN 2.2.9 that
+        # was missing the "Database" stanza completely.
+        while attempt_read_and_call(uhandle, consumer.database, start="  Database"):
+            # BLAT output ends abruptly here, without any of the other
+            # information.  Check to see if this is the case.  If so,
+            # then end the database report here gracefully.
+            if not uhandle.peekline().strip() or uhandle.peekline().startswith("BLAST"):
+                consumer.end_database_report()
+                return
+
+            # Database can span multiple lines.
+            read_and_call_until(uhandle, consumer.database, start="    Posted")
+            read_and_call(uhandle, consumer.posted_date, start="    Posted")
+            read_and_call(
+                uhandle, consumer.num_letters_in_database, start="  Number of letters"
+            )
+            read_and_call(
+                uhandle,
+                consumer.num_sequences_in_database,
+                start="  Number of sequences",
+            )
+            # There may not be a line starting with spaces...
+            attempt_read_and_call(uhandle, consumer.noevent, start="  ")
+
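+            # Peek at the next line (read it, then push it back) to see
+            # whether we have reached the Karlin-Altschul block ("Lambda"),
+            # which ends the per-database stanzas.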
+            line = safe_readline(uhandle)
+            uhandle.saveline(line)
+            if "Lambda" in line:
+                break
+
+        try:
+            read_and_call(uhandle, consumer.noevent, start="Lambda")
+            read_and_call(uhandle, consumer.ka_params)
+        except Exception:  # TODO: ValueError, AttributeError?
+            pass
+
+        # This blank line is optional:
+        attempt_read_and_call(uhandle, consumer.noevent, blank=1)
+
+        # not BLASTP
+        attempt_read_and_call(uhandle, consumer.gapped, start="Gapped")
+        # not TBLASTX
+        if attempt_read_and_call(uhandle, consumer.noevent, start="Lambda"):
+            read_and_call(uhandle, consumer.ka_params_gap)
+
+        # Blast 2.2.4 can sometimes skip the whole parameter section.
+        # Thus, I need to be careful not to read past the end of the
+        # file.
+        try:
+            read_and_call_while(uhandle, consumer.noevent, blank=1)
+        except ValueError as x:
+            if str(x) != "Unexpected end of stream.":
+                raise
+        consumer.end_database_report()
+
+    def _scan_parameters(self, uhandle, consumer):
+        # Matrix: BLOSUM62
+        # Gap Penalties: Existence: 11, Extension: 1
+        # Number of Hits to DB: 50604
+        # Number of Sequences: 1323
+        # Number of extensions: 1526
+        # Number of successful extensions: 6
+        # Number of sequences better than 10.0: 5
+        # Number of HSP's better than 10.0 without gapping: 5
+        # Number of HSP's successfully gapped in prelim test: 0
+        # Number of HSP's that attempted gapping in prelim test: 1
+        # Number of HSP's gapped (non-prelim): 5
+        # length of query: 140
+        # length of database: 223,339
+        # effective HSP length: 39
+        # effective length of query: 101
+        # effective length of database: 171,742
+        # effective search space: 17345942
+        # effective search space used: 17345942
+        # T: 11
+        # A: 40
+        # X1: 16 ( 7.4 bits)
+        # X2: 38 (14.8 bits)
+        # X3: 64 (24.9 bits)
+        # S1: 41 (21.9 bits)
+        # S2: 42 (20.8 bits)
+        ##########################################
+        # Or, more recently Blast(x) 2.2.15 gives
+        ##########################################
+        # Matrix: BLOSUM62
+        # Gap Penalties: Existence: 11, Extension: 1
+        # Number of Sequences: 4535438
+        # Number of Hits to DB: 2,588,844,100
+        # Number of extensions: 60427286
+        # Number of successful extensions: 126433
+        # Number of sequences better than  2.0: 30
+        # Number of HSP's gapped: 126387
+        # Number of HSP's successfully gapped: 35
+        # Length of query: 291
+        # Length of database: 1,573,298,872
+        # Length adjustment: 130
+        # Effective length of query: 161
+        # Effective length of database: 983,691,932
+        # Effective search space: 158374401052
+        # Effective search space used: 158374401052
+        # Neighboring words threshold: 12
+        # Window for multiple hits: 40
+        # X1: 16 ( 7.3 bits)
+        # X2: 38 (14.6 bits)
+        # X3: 64 (24.7 bits)
+        # S1: 41 (21.7 bits)
+        # S2: 32 (16.9 bits)
+
+        # Blast 2.2.4 can sometimes skip the whole parameter section.
+        # BLAT also skips the whole parameter section.
+        # Thus, check to make sure that the parameter section really
+        # exists.
+        if not uhandle.peekline().strip():
+            return
+
+        # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and
+        # "Number of Sequences" lines.
+        consumer.start_parameters()
+
+        # Matrix line may be missing in BLASTN 2.2.9
+        attempt_read_and_call(uhandle, consumer.matrix, start="Matrix")
+        # not TBLASTX
+        attempt_read_and_call(uhandle, consumer.gap_penalties, start="Gap")
+        attempt_read_and_call(
+            uhandle, consumer.num_sequences, start="Number of Sequences"
+        )
+        attempt_read_and_call(uhandle, consumer.num_hits, start="Number of Hits")
+        attempt_read_and_call(
+            uhandle, consumer.num_sequences, start="Number of Sequences"
+        )
+        attempt_read_and_call(
+            uhandle, consumer.num_extends, start="Number of extensions"
+        )
+        attempt_read_and_call(
+            uhandle, consumer.num_good_extends, start="Number of successful"
+        )
+        attempt_read_and_call(
+            uhandle, consumer.num_seqs_better_e, start="Number of sequences"
+        )
+
+        # not BLASTN, TBLASTX
+        if attempt_read_and_call(
+            uhandle, consumer.hsps_no_gap, start="Number of HSP's better"
+        ):
+            # BLASTN 2.2.9
+            if attempt_read_and_call(
+                uhandle, consumer.noevent, start="Number of HSP's gapped:"
+            ):
+                read_and_call(
+                    uhandle, consumer.noevent, start="Number of HSP's successfully"
+                )
+                # This is omitted in 2.2.15
+                attempt_read_and_call(
+                    uhandle, consumer.noevent, start="Number of extra gapped extensions"
+                )
+            else:
+                read_and_call(
+                    uhandle,
+                    consumer.hsps_prelim_gapped,
+                    start="Number of HSP's successfully",
+                )
+                read_and_call(
+                    uhandle,
+                    consumer.hsps_prelim_gapped_attempted,
+                    start="Number of HSP's that",
+                )
+                read_and_call(
+                    uhandle, consumer.hsps_gapped, start="Number of HSP's gapped"
+                )
+        # e.g. BLASTX 2.2.15 where the "better" line is missing
+        elif attempt_read_and_call(
+            uhandle, consumer.noevent, start="Number of HSP's gapped"
+        ):
+            read_and_call(
+                uhandle, consumer.noevent, start="Number of HSP's successfully"
+            )
+
+        # not in blastx 2.2.1
+        attempt_read_and_call(
+            uhandle, consumer.query_length, has_re=re.compile(r"[Ll]ength of query")
+        )
+        # Not in BLASTX 2.2.22+
+        attempt_read_and_call(
+            uhandle,
+            consumer.database_length,
+            has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"),
+        )
+
+        # BLASTN 2.2.9
+        attempt_read_and_call(uhandle, consumer.noevent, start="Length adjustment")
+        attempt_read_and_call(
+            uhandle, consumer.effective_hsp_length, start="effective HSP"
+        )
+        # Not in blastx 2.2.1
+        attempt_read_and_call(
+            uhandle,
+            consumer.effective_query_length,
+            has_re=re.compile(r"[Ee]ffective length of query"),
+        )
+
+        # This is not in BLASTP 2.2.15
+        attempt_read_and_call(
+            uhandle,
+            consumer.effective_database_length,
+            has_re=re.compile(r"[Ee]ffective length of \s*[Dd]atabase"),
+        )
+        # Not in blastx 2.2.1, added a ':' to distinguish between
+        # this and the 'effective search space used' line
+        attempt_read_and_call(
+            uhandle,
+            consumer.effective_search_space,
+            has_re=re.compile(r"[Ee]ffective search space:"),
+        )
+        # Does not appear in BLASTP 2.0.5
+        attempt_read_and_call(
+            uhandle,
+            consumer.effective_search_space_used,
+            has_re=re.compile(r"[Ee]ffective search space used"),
+        )
+
+        # BLASTX, TBLASTN, TBLASTX
+        attempt_read_and_call(uhandle, consumer.frameshift, start="frameshift")
+
+        # not in BLASTN 2.2.9
+        attempt_read_and_call(uhandle, consumer.threshold, start="T")
+        # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12"
+        attempt_read_and_call(
+            uhandle, consumer.threshold, start="Neighboring words threshold"
+        )
+
+        # not in BLASTX 2.2.15
+        attempt_read_and_call(uhandle, consumer.window_size, start="A")
+        # get this instead: "Window for multiple hits: 40"
+        attempt_read_and_call(
+            uhandle, consumer.window_size, start="Window for multiple hits"
+        )
+
+        # not in BLASTX 2.2.22+
+        attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start="X1")
+        # not TBLASTN
+        attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start="X2")
+
+        # not BLASTN, TBLASTX
+        attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, start="X3")
+
+        # not TBLASTN
+        attempt_read_and_call(uhandle, consumer.gap_trigger, start="S1")
+        # not in blastx 2.2.1
+        # first we make sure we have additional lines to work with, if
+        # not then the file is done and we don't have a final S2
+        if not is_blank_line(uhandle.peekline(), allow_spaces=1):
+            read_and_call(uhandle, consumer.blast_cutoff, start="S2")
+
+        consumer.end_parameters()
+
+
+class BlastParser(AbstractParser):
+    """Parses BLAST data into a Record.Blast object."""
+
+    def __init__(self):
+        """Initialize the class."""
+        self._scanner = _Scanner()
+        self._consumer = _BlastConsumer()
+
+    def parse(self, handle):
+        """Parse BLAST handle into a Record.Blast object."""
+        self._scanner.feed(handle, self._consumer)
+        return self._consumer.data
+
+
+class PSIBlastParser(AbstractParser):
+    """Parses BLAST data into a Record.PSIBlast object."""
+
+    def __init__(self):
+        """Initialize the class."""
+        self._scanner = _Scanner()
+        self._consumer = _PSIBlastConsumer()
+
+    def parse(self, handle):
+        """Parse BLAST handle into a Record.PSIBlast object."""
+        self._scanner.feed(handle, self._consumer)
+        return self._consumer.data
+
+
+class _HeaderConsumer:
+    def start_header(self):
+        self._header = Record.Header()
+
+    def version(self, line):
+        c = line.split()
+        self._header.application = c[0]
+        self._header.version = c[1]
+        if len(c) > 2:
+            # The date is missing in the new C++ output from blastx 2.2.22+
+            # Just get "BLASTX 2.2.22+\n" and that's all.
+            self._header.date = c[2][1:-1]
+
+    def reference(self, line):
+        if line.startswith("Reference: "):
+            self._header.reference = line[11:]
+        else:
+            self._header.reference += line
+
+    def query_info(self, line):
+        if line.startswith("Query= "):
+            self._header.query = line[7:].lstrip()
+        elif line.startswith("Length="):
+            # New style way to give the query length in BLAST 2.2.22+ (the C++ code)
+            self._header.query_letters = _safe_int(line[7:].strip())
+        elif not line.startswith("       "):  # continuation of query_info
+            self._header.query = "%s%s" % (self._header.query, line)
+        else:
+            # Hope it is the old style way to give the query length:
+            (letters,) = _re_search(
+                r"([0-9,]+) letters",
+                line,
+                "I could not find the number of letters in line\n%s" % line,
+            )
+            self._header.query_letters = _safe_int(letters)
+
+    def database_info(self, line):
+        line = line.rstrip()
+        if line.startswith("Database: "):
+            self._header.database = line[10:]
+        elif not line.endswith("total letters"):
+            if self._header.database:
+                # Need to include a space when merging a multi-line database description
+                self._header.database += " " + line.strip()
+            else:
+                self._header.database = line.strip()
+        else:
+            sequences, letters = _re_search(
+                r"([0-9,]+) sequences; ([0-9,-]+) total letters",
+                line,
+                "I could not find the sequences and letters in line\n%s" % line,
+            )
+            self._header.database_sequences = _safe_int(sequences)
+            self._header.database_letters = _safe_int(letters)
+
+    def end_header(self):
+        # Get rid of the trailing newlines
+        self._header.reference = self._header.reference.rstrip()
+        self._header.query = self._header.query.rstrip()
+
+
+class _DescriptionConsumer:
+    def start_descriptions(self):
+        self._descriptions = []
+        self._model_sequences = []
+        self._nonmodel_sequences = []
+        self._converged = 0
+        self._type = None
+        self._roundnum = None
+
+        self.__has_n = 0  # Does the description line contain an N value?
+
+    def description_header(self, line):
+        if line.startswith("Sequences producing"):
+            cols = line.split()
+            if cols[-1] == "N":
+                self.__has_n = 1
+
+    def description(self, line):
+        dh = self._parse(line)
+        if self._type == "model":
+            self._model_sequences.append(dh)
+        elif self._type == "nonmodel":
+            self._nonmodel_sequences.append(dh)
+        else:
+            self._descriptions.append(dh)
+
+    def model_sequences(self, line):
+        self._type = "model"
+
+    def nonmodel_sequences(self, line):
+        self._type = "nonmodel"
+
+    def converged(self, line):
+        self._converged = 1
+
+    def no_hits(self, line):
+        pass
+
+    def round(self, line):
+        if not line.startswith("Results from round"):
+            raise ValueError("I didn't understand the round line\n%s" % line)
+        self._roundnum = _safe_int(line[18:].strip())
+
+    def end_descriptions(self):
+        pass
+
+    def _parse(self, description_line):
+        line = description_line  # for convenience
+        dh = Record.Description()
+
+        # I need to separate the score and p-value from the title.
+        # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77
+        # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77  1
+        # special cases to handle:
+        #   - title must be preserved exactly (including whitespaces)
+        #   - score could be equal to e-value (not likely, but what if??)
+        #   - sometimes there's an "N" score of '1'.
+        cols = line.split()
+        if len(cols) < 3:
+            raise ValueError("Line does not appear to contain description:\n%s" % line)
+        if self.__has_n:
+            i = line.rfind(cols[-1])  # find start of N
+            i = line.rfind(cols[-2], 0, i)  # find start of p-value
+            i = line.rfind(cols[-3], 0, i)  # find start of score
+        else:
+            i = line.rfind(cols[-1])  # find start of p-value
+            i = line.rfind(cols[-2], 0, i)  # find start of score
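+        # e.g. for "sp|P21297|FLBT_CAUCR FLBT PROTEIN     284  7e-77"
+        # (no N column), the second rfind leaves i at the start of "284",
+        # so line[:i] below recovers the title, trailing spaces included
+        # (they are stripped when the fields are assigned).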
+        if self.__has_n:
+            dh.title, dh.score, dh.e, dh.num_alignments = (
+                line[:i].rstrip(),
+                cols[-3],
+                cols[-2],
+                cols[-1],
+            )
+        else:
+            dh.title, dh.score, dh.e, dh.num_alignments = (
+                line[:i].rstrip(),
+                cols[-2],
+                cols[-1],
+                1,
+            )
+        dh.num_alignments = _safe_int(dh.num_alignments)
+        dh.score = _safe_int(dh.score)
+        dh.e = _safe_float(dh.e)
+        return dh
+
+
+class _AlignmentConsumer:
+    # This is a little bit tricky.  An alignment can either be a
+    # pairwise alignment or a multiple alignment.  Since it's difficult
+    # to know a priori which one the blast record will contain, I'm going
+    # to make one class that can parse both of them.
+    def start_alignment(self):
+        self._alignment = Record.Alignment()
+        self._multiple_alignment = Record.MultipleAlignment()
+
+    def title(self, line):
+        if self._alignment.title:
+            self._alignment.title += " "
+        self._alignment.title += line.strip()
+
+    def length(self, line):
+        # e.g. "Length = 81" or more recently, "Length=428"
+        parts = line.replace(" ", "").split("=")
+        if len(parts) != 2:
+            raise ValueError("Unrecognised format length line: %r" % line)
+        self._alignment.length = parts[1]
+        self._alignment.length = _safe_int(self._alignment.length)
+
+    def multalign(self, line):
+        # Standalone version uses 'QUERY', while WWW version uses blast_tmp.
+        if line.startswith("QUERY") or line.startswith("blast_tmp"):
+            # If this is the first line of the multiple alignment,
+            # then I need to figure out how the line is formatted.
+
+            # Format of line is:
+            # QUERY 1   acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
+            try:
+                name, start, seq, end = line.split()
+            except ValueError:
+                raise ValueError("I do not understand the line\n%s" % line) from None
+            self._start_index = line.index(start, len(name))
+            self._seq_index = line.index(seq, self._start_index + len(start))
+            # subtract 1 for the space
+            self._name_length = self._start_index - 1
+            self._start_length = self._seq_index - self._start_index - 1
+            self._seq_length = line.rfind(end) - self._seq_index - 1
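+            # e.g. for "QUERY 1   acttg...aacg 60": name is "QUERY", so
+            # _name_length is 5 and _start_index is 6; these fixed column
+            # offsets are reused to slice every later line of the block.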
+
+            # self._seq_index = line.index(seq)
+            # # subtract 1 for the space
+            # self._seq_length = line.rfind(end) - self._seq_index - 1
+            # self._start_index = line.index(start)
+            # self._start_length = self._seq_index - self._start_index - 1
+            # self._name_length = self._start_index
+
+        # Extract the information from the line
+        name = line[: self._name_length]
+        name = name.rstrip()
+        start = line[self._start_index : self._start_index + self._start_length]
+        start = start.rstrip()
+        if start:
+            start = _safe_int(start)
+        end = line[self._seq_index + self._seq_length :].rstrip()
+        if end:
+            end = _safe_int(end)
+        seq = line[self._seq_index : self._seq_index + self._seq_length].rstrip()
+        # right pad the sequence with spaces if necessary
+        if len(seq) < self._seq_length:
+            seq += " " * (self._seq_length - len(seq))
+
+        # I need to make sure the sequence is aligned correctly with the query.
+        # First, I will find the length of the query.  Then, if necessary,
+        # I will pad my current sequence with spaces so that they will line
+        # up correctly.
+
+        # Two possible things can happen:
+        # QUERY
+        # 504
+        #
+        # QUERY
+        # 403
+        #
+        # Sequence 504 will need padding at the end.  Since I won't know
+        # this until the end of the alignment, this will be handled in
+        # end_alignment.
+        # Sequence 403 will need padding before being added to the alignment.
+
+        align = self._multiple_alignment.alignment  # for convenience
+        align.append((name, start, seq, end))
+
+        # This is old code that tried to line up all the sequences
+        # in a multiple alignment by using the sequence title's as
+        # identifiers.  The problem with this is that BLAST assigns
+        # different HSP's from the same sequence the same id.  Thus,
+        # in one alignment block, there may be multiple sequences with
+        # the same id.  I'm not sure how to handle this, so I'm not
+        # going to.
+
+        # # If the sequence is the query, then just add it.
+        # if name == 'QUERY':
+        #     if len(align) == 0:
+        #         align.append((name, start, seq))
+        #     else:
+        #         aname, astart, aseq = align[0]
+        #         if name != aname:
+        #             raise ValueError, "Query is not the first sequence"
+        #         aseq = aseq + seq
+        #         align[0] = aname, astart, aseq
+        # else:
+        #     if len(align) == 0:
+        #         raise ValueError, "I could not find the query sequence"
+        #     qname, qstart, qseq = align[0]
+        #
+        #     # Now find my sequence in the multiple alignment.
+        #     for i in range(1, len(align)):
+        #         aname, astart, aseq = align[i]
+        #         if name == aname:
+        #             index = i
+        #             break
+        #     else:
+        #         # If I couldn't find it, then add a new one.
+        #         align.append((None, None, None))
+        #         index = len(align)-1
+        #         # Make sure to left-pad it.
+        #         aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq))
+        #
+        #     if len(qseq) != len(aseq) + len(seq):
+        #         # If my sequences are shorter than the query sequence,
+        #         # then I will need to pad some spaces to make them line up.
+        #         # Since I've already right padded seq, that means aseq
+        #         # must be too short.
+        #         aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq))
+        #     aseq = aseq + seq
+        #     if astart is None:
+        #         astart = start
+        #     align[index] = aname, astart, aseq
+
+    def end_alignment(self):
+        # Remove trailing newlines
+        if self._alignment:
+            self._alignment.title = self._alignment.title.rstrip()
+
+        # This code is also obsolete.  See note above.
+        # If there's a multiple alignment, I will need to make sure
+        # all the sequences are aligned.  That is, I may need to
+        # right-pad the sequences.
+        # if self._multiple_alignment is not None:
+        #     align = self._multiple_alignment.alignment
+        #     seqlen = None
+        #     for i in range(len(align)):
+        #         name, start, seq = align[i]
+        #         if seqlen is None:
+        #             seqlen = len(seq)
+        #         else:
+        #             if len(seq) < seqlen:
+        #                 seq = seq + ' '*(seqlen - len(seq))
+        #                 align[i] = name, start, seq
+        #             elif len(seq) > seqlen:
+        #                 raise ValueError, \
+        #                       "Sequence %s is longer than the query" % name
+
+        # Clean up some variables, if they exist.
+        try:
+            del self._seq_index
+            del self._seq_length
+            del self._start_index
+            del self._start_length
+            del self._name_length
+        except AttributeError:
+            pass
+
+
+class _HSPConsumer:
+    def start_hsp(self):
+        self._hsp = Record.HSP()
+
+    def score(self, line):
+        self._hsp.bits, self._hsp.score = _re_search(
+            r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)",
+            line,
+            "I could not find the score in line\n%s" % line,
+        )
+        self._hsp.score = _safe_float(self._hsp.score)
+        self._hsp.bits = _safe_float(self._hsp.bits)
+
+        x, y = _re_search(
+            r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)",
+            line,
+            "I could not find the expect in line\n%s" % line,
+        )
+        if x:
+            self._hsp.num_alignments = _safe_int(x)
+        else:
+            self._hsp.num_alignments = 1
+        self._hsp.expect = _safe_float(y)
+
+    def identities(self, line):
+        x, y = _re_search(
+            r"Identities = (\d+)\/(\d+)",
+            line,
+            "I could not find the identities in line\n%s" % line,
+        )
+        self._hsp.identities = _safe_int(x), _safe_int(y)
+        self._hsp.align_length = _safe_int(y)
+
+        if "Positives" in line:
+            x, y = _re_search(
+                r"Positives = (\d+)\/(\d+)",
+                line,
+                "I could not find the positives in line\n%s" % line,
+            )
+            self._hsp.positives = _safe_int(x), _safe_int(y)
+            assert self._hsp.align_length == _safe_int(y)
+
+        if "Gaps" in line:
+            x, y = _re_search(
+                r"Gaps = (\d+)\/(\d+)",
+                line,
+                "I could not find the gaps in line\n%s" % line,
+            )
+            self._hsp.gaps = _safe_int(x), _safe_int(y)
+            assert self._hsp.align_length == _safe_int(y)
+
+    def strand(self, line):
+        self._hsp.strand = _re_search(
+            r"Strand\s?=\s?(\w+)\s?/\s?(\w+)",
+            line,
+            "I could not find the strand in line\n%s" % line,
+        )
+
+    def frame(self, line):
+        # Frame can be in formats:
+        # Frame = +1
+        # Frame = +2 / +2
+        if "/" in line:
+            self._hsp.frame = _re_search(
+                r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])",
+                line,
+                "I could not find the frame in line\n%s" % line,
+            )
+        else:
+            self._hsp.frame = _re_search(
+                r"Frame = ([-+][123])",
+                line,
+                "I could not find the frame in line\n%s" % line,
+            )
+
+    # Match a space, if one is available.  Masahir Ishikawa found a
+    # case where there's no space between the start and the sequence:
+    # Query: 100tt 101
+    # line below modified by Yair Benita, Sep 2004
+    # Note that the colon is not always present. 2006
+    _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
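+    # Groups: optional colon, start coordinate, aligned sequence, end
+    # coordinate.  m.start(3) records the column where the sequence
+    # begins, so align() can slice the unlabeled match line at the
+    # same offset.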
+
+    def query(self, line):
+        m = self._query_re.search(line)
+        if m is None:
+            if (
+                line.strip()
+                == "Query        ------------------------------------------------------------"
+            ):
+                # Special case - long gap relative to the subject,
+                # note there is no start/end present, cannot update those
+                self._hsp.query += "-" * 60
+                self._query_len = 60  # number of dashes
+                self._query_start_index = 13  # offset of first dash
+                return
+            raise ValueError("I could not find the query in line\n%s" % line)
+
+        # line below modified by Yair Benita, Sep 2004.
+        # added the end attribute for the query
+        colon, start, seq, end = m.groups()
+        seq = seq.strip()
+        self._hsp.query += seq
+        if self._hsp.query_start is None:
+            self._hsp.query_start = _safe_int(start)
+
+        # line below added by Yair Benita, Sep 2004.
+        # added the end attribute for the query
+        self._hsp.query_end = _safe_int(end)
+
+        # Get index for sequence start (regular expression element 3)
+        self._query_start_index = m.start(3)
+        self._query_len = len(seq)
+
+    def align(self, line):
+        seq = line[self._query_start_index :].rstrip()
+        if len(seq) < self._query_len:
+            # Make sure the alignment is the same length as the query
+            seq += " " * (self._query_len - len(seq))
+        elif len(seq) > self._query_len:
+            raise ValueError("Match is longer than the query in line\n%s" % line)
+        self._hsp.match += seq
+
+    # To match how we do the query, cache the regular expression.
+    # Note that the colon is not always present.
+    _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
+
+    def sbjct(self, line):
+        m = self._sbjct_re.search(line)
+        if m is None:
+            raise ValueError("I could not find the sbjct in line\n%s" % line)
+        colon, start, seq, end = m.groups()
+        # mikep 26/9/00
+        # On occasion, there is a blast hit with no subject match
+        # so far, it only occurs with 1-line short "matches"
+        # I have decided to let these pass as they appear
+        if not seq.strip():
+            seq = " " * self._query_len
+        else:
+            seq = seq.strip()
+        self._hsp.sbjct += seq
+        if self._hsp.sbjct_start is None:
+            self._hsp.sbjct_start = _safe_int(start)
+
+        self._hsp.sbjct_end = _safe_int(end)
+        if len(seq) != self._query_len:
+            raise ValueError(
+                "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s"
+                % (self._query_len, self._hsp.query, len(seq), line)
+            )
+
+        del self._query_start_index  # clean up unused variables
+        del self._query_len
+
+    def end_hsp(self):
+        pass
+
+
+class _DatabaseReportConsumer:
+    def start_database_report(self):
+        self._dr = Record.DatabaseReport()
+
+    def database(self, line):
+        m = re.search(r"Database: (.+)$", line)
+        if m:
+            self._dr.database_name.append(m.group(1))
+        elif self._dr.database_name:
+            # This must be a continuation of the previous name.
+            self._dr.database_name[-1] = "%s%s" % (
+                self._dr.database_name[-1],
+                line.strip(),
+            )
+
+    def posted_date(self, line):
+        self._dr.posted_date.append(
+            _re_search(
+                r"Posted date:\s*(.+)$",
+                line,
+                "I could not find the posted date in line\n%s" % line,
+            )
+        )
+
+    def num_letters_in_database(self, line):
+        (letters,) = _get_cols(
+            line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}
+        )
+        self._dr.num_letters_in_database.append(_safe_int(letters))
+
+    def num_sequences_in_database(self, line):
+        (sequences,) = _get_cols(
+            line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}
+        )
+        self._dr.num_sequences_in_database.append(_safe_int(sequences))
+
+    def ka_params(self, line):
+        self._dr.ka_params = [_safe_float(x) for x in line.split()]
+
+    def gapped(self, line):
+        self._dr.gapped = 1
+
+    def ka_params_gap(self, line):
+        self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
+
+    def end_database_report(self):
+        pass
+
+
+class _ParametersConsumer:
+    def start_parameters(self):
+        self._params = Record.Parameters()
+
+    def matrix(self, line):
+        self._params.matrix = line[8:].rstrip()
+
+    def gap_penalties(self, line):
+        self._params.gap_penalties = [
+            _safe_float(x)
+            for x in _get_cols(
+                line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"}
+            )
+        ]
+
+    def num_hits(self, line):
+        if "1st pass" in line:
+            (x,) = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"})
+            self._params.num_hits = _safe_int(x)
+        else:
+            (x,) = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"})
+            self._params.num_hits = _safe_int(x)
+
+    def num_sequences(self, line):
+        if "1st pass" in line:
+            (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"})
+            self._params.num_sequences = _safe_int(x)
+        else:
+            (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"})
+            self._params.num_sequences = _safe_int(x)
+
+    def num_extends(self, line):
+        if "1st pass" in line:
+            (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"})
+            self._params.num_extends = _safe_int(x)
+        else:
+            (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"})
+            self._params.num_extends = _safe_int(x)
+
+    def num_good_extends(self, line):
+        if "1st pass" in line:
+            (x,) = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"})
+            self._params.num_good_extends = _safe_int(x)
+        else:
+            (x,) = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"})
+            self._params.num_good_extends = _safe_int(x)
+
+    def num_seqs_better_e(self, line):
+        (self._params.num_seqs_better_e,) = _get_cols(
+            line, (-1,), ncols=7, expected={2: "sequences"}
+        )
+        self._params.num_seqs_better_e = _safe_int(self._params.num_seqs_better_e)
+
+    def hsps_no_gap(self, line):
+        (self._params.hsps_no_gap,) = _get_cols(
+            line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}
+        )
+        self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
+
+    def hsps_prelim_gapped(self, line):
+        (self._params.hsps_prelim_gapped,) = _get_cols(
+            line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}
+        )
+        self._params.hsps_prelim_gapped = _safe_int(self._params.hsps_prelim_gapped)
+
+    def hsps_prelim_gapped_attempted(self, line):
+        (self._params.hsps_prelim_gapped_attempted,) = _get_cols(
+            line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}
+        )
+        self._params.hsps_prelim_gapped_attempted = _safe_int(
+            self._params.hsps_prelim_gapped_attempted
+        )
+
+    def hsps_gapped(self, line):
+        (self._params.hsps_gapped,) = _get_cols(
+            line, (-1,), ncols=6, expected={3: "gapped"}
+        )
+        self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
+
+    def query_length(self, line):
+        (self._params.query_length,) = _get_cols(
+            line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}
+        )
+        self._params.query_length = _safe_int(self._params.query_length)
+
+    def database_length(self, line):
+        (self._params.database_length,) = _get_cols(
+            line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}
+        )
+        self._params.database_length = _safe_int(self._params.database_length)
+
+    def effective_hsp_length(self, line):
+        (self._params.effective_hsp_length,) = _get_cols(
+            line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}
+        )
+        self._params.effective_hsp_length = _safe_int(self._params.effective_hsp_length)
+
+    def effective_query_length(self, line):
+        (self._params.effective_query_length,) = _get_cols(
+            line, (-1,), ncols=5, expected={1: "length", 3: "query:"}
+        )
+        self._params.effective_query_length = _safe_int(
+            self._params.effective_query_length
+        )
+
+    def effective_database_length(self, line):
+        (self._params.effective_database_length,) = _get_cols(
+            line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}
+        )
+        self._params.effective_database_length = _safe_int(
+            self._params.effective_database_length
+        )
+
+    def effective_search_space(self, line):
+        (self._params.effective_search_space,) = _get_cols(
+            line, (-1,), ncols=4, expected={1: "search"}
+        )
+        self._params.effective_search_space = _safe_int(
+            self._params.effective_search_space
+        )
+
+    def effective_search_space_used(self, line):
+        (self._params.effective_search_space_used,) = _get_cols(
+            line, (-1,), ncols=5, expected={1: "search", 3: "used:"}
+        )
+        self._params.effective_search_space_used = _safe_int(
+            self._params.effective_search_space_used
+        )
+
+    def frameshift(self, line):
+        self._params.frameshift = _get_cols(
+            line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"}
+        )
+
+    def threshold(self, line):
+        if line[:2] == "T:":
+            # Assume its an old style line like "T: 123"
+            (self._params.threshold,) = _get_cols(
+                line, (1,), ncols=2, expected={0: "T:"}
+            )
+        elif line[:28] == "Neighboring words threshold:":
+            (self._params.threshold,) = _get_cols(
+                line,
+                (3,),
+                ncols=4,
+                expected={0: "Neighboring", 1: "words", 2: "threshold:"},
+            )
+        else:
+            raise ValueError("Unrecognised threshold line:\n%s" % line)
+        self._params.threshold = _safe_int(self._params.threshold)
+
+    def window_size(self, line):
+        if line[:2] == "A:":
+            (self._params.window_size,) = _get_cols(
+                line, (1,), ncols=2, expected={0: "A:"}
+            )
+        elif line[:25] == "Window for multiple hits:":
+            (self._params.window_size,) = _get_cols(
+                line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"}
+            )
+        else:
+            raise ValueError("Unrecognised window size line:\n%s" % line)
+        self._params.window_size = _safe_int(self._params.window_size)
+
+    def dropoff_1st_pass(self, line):
+        score, bits = _re_search(
+            r"X1: (\d+) \(\s*([0-9,.]+) bits\)",
+            line,
+            "I could not find the dropoff in line\n%s" % line,
+        )
+        self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
+
+    def gap_x_dropoff(self, line):
+        score, bits = _re_search(
+            r"X2: (\d+) \(\s*([0-9,.]+) bits\)",
+            line,
+            "I could not find the gap dropoff in line\n%s" % line,
+        )
+        self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
+
+    def gap_x_dropoff_final(self, line):
+        score, bits = _re_search(
+            r"X3: (\d+) \(\s*([0-9,.]+) bits\)",
+            line,
+            "I could not find the gap dropoff final in line\n%s" % line,
+        )
+        self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
+
+    def gap_trigger(self, line):
+        score, bits = _re_search(
+            r"S1: (\d+) \(\s*([0-9,.]+) bits\)",
+            line,
+            "I could not find the gap trigger in line\n%s" % line,
+        )
+        self._params.gap_trigger = _safe_int(score), _safe_float(bits)
+
+    def blast_cutoff(self, line):
+        score, bits = _re_search(
+            r"S2: (\d+) \(\s*([0-9,.]+) bits\)",
+            line,
+            "I could not find the blast cutoff in line\n%s" % line,
+        )
+        self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
+
+    def end_parameters(self):
+        pass
+
+
+class _BlastConsumer(
+    AbstractConsumer,
+    _HeaderConsumer,
+    _DescriptionConsumer,
+    _AlignmentConsumer,
+    _HSPConsumer,
+    _DatabaseReportConsumer,
+    _ParametersConsumer,
+):
+    # This Consumer inherits from many other consumer classes that handle
+    # the actual dirty work.  An alternative would be to create objects of
+    # those classes and delegate the parsing tasks to them in a
+    # decorator-type pattern.  The disadvantage of that is that the method
+    # names would still need to be resolved in this class.  However, using
+    # a decorator would retain more control in this class (which may or
+    # may not be a bad thing).  In addition, having each sub-consumer as
+    # its own object would prevent this object's dictionary from being
+    # cluttered with members and reduce the chance of member collisions.
+    def __init__(self):
+        self.data = None
+
+    def round(self, line):
+        # Make sure nobody's trying to pass me PSI-BLAST data!
+        raise ValueError("This consumer doesn't handle PSI-BLAST data")
+
+    def start_header(self):
+        self.data = Record.Blast()
+        _HeaderConsumer.start_header(self)
+
+    def end_header(self):
+        _HeaderConsumer.end_header(self)
+        self.data.__dict__.update(self._header.__dict__)
+
+    def end_descriptions(self):
+        self.data.descriptions = self._descriptions
+
+    def end_alignment(self):
+        _AlignmentConsumer.end_alignment(self)
+        if self._alignment.hsps:
+            self.data.alignments.append(self._alignment)
+        if self._multiple_alignment.alignment:
+            self.data.multiple_alignment = self._multiple_alignment
+
+    def end_hsp(self):
+        _HSPConsumer.end_hsp(self)
+        try:
+            self._alignment.hsps.append(self._hsp)
+        except AttributeError:
+            raise ValueError("Found an HSP before an alignment") from None
+
+    def end_database_report(self):
+        _DatabaseReportConsumer.end_database_report(self)
+        self.data.__dict__.update(self._dr.__dict__)
+
+    def end_parameters(self):
+        _ParametersConsumer.end_parameters(self)
+        self.data.__dict__.update(self._params.__dict__)
+
+
+class _PSIBlastConsumer(
+    AbstractConsumer,
+    _HeaderConsumer,
+    _DescriptionConsumer,
+    _AlignmentConsumer,
+    _HSPConsumer,
+    _DatabaseReportConsumer,
+    _ParametersConsumer,
+):
+    def __init__(self):
+        self.data = None
+
+    def start_header(self):
+        self.data = Record.PSIBlast()
+        _HeaderConsumer.start_header(self)
+
+    def end_header(self):
+        _HeaderConsumer.end_header(self)
+        self.data.__dict__.update(self._header.__dict__)
+
+    def start_descriptions(self):
+        self._round = Record.Round()
+        self.data.rounds.append(self._round)
+        _DescriptionConsumer.start_descriptions(self)
+
+    def end_descriptions(self):
+        _DescriptionConsumer.end_descriptions(self)
+        self._round.number = self._roundnum
+        if self._descriptions:
+            self._round.new_seqs.extend(self._descriptions)
+        self._round.reused_seqs.extend(self._model_sequences)
+        self._round.new_seqs.extend(self._nonmodel_sequences)
+        if self._converged:
+            self.data.converged = 1
+
+    def end_alignment(self):
+        _AlignmentConsumer.end_alignment(self)
+        if self._alignment.hsps:
+            self._round.alignments.append(self._alignment)
+        if self._multiple_alignment:
+            self._round.multiple_alignment = self._multiple_alignment
+
+    def end_hsp(self):
+        _HSPConsumer.end_hsp(self)
+        try:
+            self._alignment.hsps.append(self._hsp)
+        except AttributeError:
+            raise ValueError("Found an HSP before an alignment") from None
+
+    def end_database_report(self):
+        _DatabaseReportConsumer.end_database_report(self)
+        self.data.__dict__.update(self._dr.__dict__)
+
+    def end_parameters(self):
+        _ParametersConsumer.end_parameters(self)
+        self.data.__dict__.update(self._params.__dict__)
+
+
+class Iterator:
+    """Iterates over a file of multiple BLAST results.
+
+    Methods:
+    next   Return the next record from the stream, or None.
+
+    """
+
+    def __init__(self, handle, parser=None):
+        """Initialize a new iterator.
+
+        Arguments:
+         - handle is a file-like object.
+         - parser is an optional Parser object to change the results
+           into another form.  If set to None, then the raw contents
+           of the file will be returned.
+        """
+        try:
+            handle.readline
+        except AttributeError:
+            raise ValueError(
+                "I expected a file handle or file-like object, got %s" % type(handle)
+            ) from None
+        self._uhandle = UndoHandle(handle)
+        self._parser = parser
+        self._header = []
+
+    def __next__(self):
+        """Return the next Blast record from the file.
+
+        If no more records, return None.
+        """
+        lines = []
+        query = False
+        while True:
+            line = self._uhandle.readline()
+            if not line:
+                break
+            # If I've reached the next one, then put the line back and stop.
+            if lines and (
+                line.startswith("BLAST")
+                or line.startswith("BLAST", 1)
+                or line.startswith(">> int("5399354557888517312")
+    # 5399354557888517312
+    # >>> int(float("5399354557888517312"))
+    # 5399354557888517120
+    return int(float(str))
+
+
+def _safe_float(str):
+    # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that
+    # float('e-172') does not produce an error on his platform.  Thus,
+    # we need to check the string for this condition.
+
+    # Sometimes BLAST leaves off the '1' in front of an exponent.
+    if str and str[0] in ["E", "e"]:
+        str = "1" + str
+    try:
+        return float(str)
+    except ValueError:
+        # Remove all commas from the string
+        str = str.replace(",", "")
+    # try again.
+    return float(str)
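+# e.g. _safe_float("e-172") returns 1e-172 (the leading '1' is restored)
+# and _safe_float("1,234") returns 1234.0 once the comma is stripped.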
+
+
+class _BlastErrorConsumer(_BlastConsumer):
+    def __init__(self):
+        _BlastConsumer.__init__(self)
+
+    def noevent(self, line):
+        if "Query must be at least wordsize" in line:
+            raise ShortQueryBlastError("Query must be at least wordsize")
+        # Now pass the line back up to the superclass.
+        method = getattr(
+            _BlastConsumer, "noevent", _BlastConsumer.__getattr__(self, "noevent")
+        )
+        method(line)
+
+
+class BlastErrorParser(AbstractParser):
+    """Attempt to catch and diagnose BLAST errors while parsing.
+
+    This utilizes the BlastParser module but adds an additional layer
+    of complexity on top of it by attempting to diagnose ValueErrors
+    that may actually indicate problems during BLAST parsing.
+
+    Current BLAST problems this detects are:
+    - LowQualityBlastError - When BLASTing really low quality sequences
+    (ie. some GenBank entries which are just short stretches of a single
+    nucleotide), BLAST will report an error with the sequence and be
+    unable to search with this. This will lead to a badly formatted
+    BLAST report that the parsers choke on. The parser will convert the
+    ValueError to a LowQualityBlastError and attempt to provide useful
+    information.
+    """
+
+    def __init__(self, bad_report_handle=None):
+        """Initialize a parser that tries to catch BlastErrors.
+
+        Arguments:
+        - bad_report_handle - An optional argument specifying a handle
+        where bad reports should be sent. This would allow you to save
+        all of the bad reports to a file, for instance. If no handle
+        is specified, the bad reports will not be saved.
+        """
+        self._bad_report_handle = bad_report_handle
+
+        # self._b_parser = BlastParser()
+        self._scanner = _Scanner()
+        self._consumer = _BlastErrorConsumer()
+
+    def parse(self, handle):
+        """Parse a handle, attempting to diagnose errors."""
+        results = handle.read()
+
+        try:
+            self._scanner.feed(StringIO(results), self._consumer)
+        except ValueError:
+            # if we have a bad_report_file, save the info to it first
+            if self._bad_report_handle:
+                # send the info to the error handle
+                self._bad_report_handle.write(results)
+
+            # now we want to try and diagnose the error
+            self._diagnose_error(StringIO(results), self._consumer.data)
+
+            # if we got here we can't figure out the problem
+            # so we should pass along the syntax error we got
+            raise
+        return self._consumer.data
+
+    def _diagnose_error(self, handle, data_record):
+        """Attempt to diagnose an error in the passed handle (PRIVATE).
+
+        Arguments:
+        - handle - The handle potentially containing the error
+        - data_record - The data record partially created by the consumer.
+        """
+        line = handle.readline()
+
+        while line:
+            # 'Searchingdone' instead of 'Searching......done' seems
+            # to indicate a failure to perform the BLAST due to
+            # low quality sequence
+            if line.startswith("Searchingdone"):
+                raise LowQualityBlastError(
+                    "Blast failure occurred on query: ", data_record.query
+                )
+            line = handle.readline()
diff --git a/code/lib/Bio/SearchIO/_legacy/ParserSupport.py b/code/lib/Bio/SearchIO/_legacy/ParserSupport.py
new file mode 100644
index 0000000..cea7499
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_legacy/ParserSupport.py
@@ -0,0 +1,380 @@
+# Copyright 1999 by Jeffrey Chang.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to support writing parsers (DEPRECATED).
+
+Classes:
+ - UndoHandle             File object decorator with support for undo-like operations.
+ - AbstractParser         Base class for parsers.
+ - AbstractConsumer       Base class of all Consumers.
+ - TaggingConsumer        Consumer that tags output with its event.  For debugging
+
+Functions:
+ - safe_readline          Read a line from a handle, with check for EOF.
+ - safe_peekline          Peek at next line, with check for EOF.
+ - read_and_call          Read a line from a handle and pass it to a method.
+ - read_and_call_while    Read many lines, as long as a condition is met.
+ - read_and_call_until    Read many lines, until a condition is met.
+ - attempt_read_and_call  Like read_and_call, but forgiving of errors.
+ - is_blank_line          Test whether a line is blank.
+
+"""
+
+import sys
+from io import StringIO
+
+from abc import ABC, abstractmethod
+
+
+class UndoHandle:
+    """A Python handle that adds functionality for saving lines.
+
+    Saves lines in a LIFO fashion.
+    """
+
+    def __init__(self, handle):
+        """Initialize the class."""
+        self._handle = handle
+        self._saved = []
+        try:
+            # If wrapping an online handle, this is nice to have:
+            self.url = handle.url
+        except AttributeError:
+            pass
+
+    def __iter__(self):
+        """Iterate over the lines in the File."""
+        return self
+
+    def __next__(self):
+        """Return the next line."""
+        next = self.readline()
+        if not next:
+            raise StopIteration
+        return next
+
+    def readlines(self, *args, **keywds):
+        """Read all the lines from the file as a list of strings."""
+        lines = self._saved + self._handle.readlines(*args, **keywds)
+        self._saved = []
+        return lines
+
+    def readline(self, *args, **keywds):
+        """Read the next line from the file as string."""
+        if self._saved:
+            line = self._saved.pop(0)
+        else:
+            line = self._handle.readline(*args, **keywds)
+        return line
+
+    def read(self, size=-1):
+        """Read the File."""
+        if size == -1:
+            saved = "".join(self._saved)
+            self._saved[:] = []
+        else:
+            saved = ""
+            while size > 0 and self._saved:
+                if len(self._saved[0]) <= size:
+                    size = size - len(self._saved[0])
+                    saved = saved + self._saved.pop(0)
+                else:
+                    saved = saved + self._saved[0][:size]
+                    self._saved[0] = self._saved[0][size:]
+                    size = 0
+        return saved + self._handle.read(size)
+
+    def saveline(self, line):
+        """Store a line in the cache memory for later use.
+
+        This acts to undo a readline, reflecting the name of the class: UndoHandle.
+        """
+        if line:
+            self._saved = [line] + self._saved
+
+    def peekline(self):
+        """Return the next line in the file, but do not move forward though the file."""
+        if self._saved:
+            line = self._saved[0]
+        else:
+            line = self._handle.readline()
+            self.saveline(line)
+        return line
+
+    def tell(self):
+        """Return the current position of the file read/write pointer within the File."""
+        return self._handle.tell() - sum(len(line) for line in self._saved)
+
+    def seek(self, *args):
+        """Set the current position at the offset specified."""
+        self._saved = []
+        self._handle.seek(*args)
+
+    def __getattr__(self, attr):
+        """Return File attribute."""
+        return getattr(self._handle, attr)
+
+    def __enter__(self):
+        """Call special method when opening the file using a with-statement."""
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """Call special method when closing the file using a with-statement."""
+        self._handle.close()
+
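+# A quick illustration of the undo behaviour (hypothetical snippet):
+#     h = UndoHandle(StringIO("one\ntwo\n"))
+#     h.peekline()         # 'one\n' -- the line is not consumed
+#     h.readline()         # 'one\n' -- the same line is returned again
+#     h.saveline("one\n")  # push it back; the next readline re-reads it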
+
+class AbstractParser(ABC):
+    """Abstract base class for other parsers."""
+
+    @abstractmethod
+    def parse(self, handle):
+        """Provision for parsing a file handle."""
+        raise NotImplementedError
+
+    def parse_str(self, string):
+        """Make string a handle, so it can be taken by parse."""
+        return self.parse(StringIO(string))
+
+    def parse_file(self, filename):
+        """Parse a file, open the file as handle so it can be taken by parse."""
+        with open(filename) as h:
+            retval = self.parse(h)
+        return retval
+
+
+class AbstractConsumer:
+    """Base class for other Consumers.
+
+    Derive Consumers from this class and implement appropriate
+    methods for each event that you want to receive.
+
+    """
+
+    # Optionally implement in the sub-class
+    def _unhandled_section(self):
+        pass
+
+    # Optionally implement in the sub-class
+    def _unhandled(self, data):
+        pass
+
+    def __getattr__(self, attr):
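+        # Events arrive as attribute lookups: start_*/end_* section
+        # markers fall back to _unhandled_section, anything else (an
+        # individual line) to _unhandled, so subclasses implement only
+        # the events they care about.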
+        if attr[:6] == "start_" or attr[:4] == "end_":
+            method = self._unhandled_section
+        else:
+            method = self._unhandled
+        return method
+
+
+class TaggingConsumer(AbstractConsumer):
+    """Debugging consumer which tags data with the event and logs it.
+
+    This is a Consumer that tags the data stream with the event and
+    prints it to a handle.  Useful for debugging.
+
+    """
+
+    def __init__(self, handle=None, colwidth=15, maxwidth=80):
+        """Initialize.
+
+        Arguments:
+         - handle to log to, defaults to ``sys.stdout``
+         - colwidth for logging to the handle
+         - maxwidth for truncation when logging
+
+        """
+        # I can't assign sys.stdout to handle in the argument list.
+        # If I do that, handle will be assigned the value of sys.stdout
+        # the first time this function is called.  This will fail if
+        # the user has assigned sys.stdout to some other file, which may
+        # be closed or invalid at a later time.
+        if handle is None:
+            handle = sys.stdout
+        self._handle = handle
+        self._colwidth = colwidth
+        self._maxwidth = maxwidth
+
+    def unhandled_section(self):
+        """Tag an unhandled section."""
+        self._print_name("unhandled_section")
+
+    def unhandled(self, data):
+        """Tag unhandled data."""
+        self._print_name("unhandled", data)
+
+    def _print_name(self, name, data=None):
+        if data is None:
+            # Write the name of a section.
+            self._handle.write("%s %s\n" % ("*" * self._colwidth, name))
+        else:
+            # Write the tag and line.
+            self._handle.write(
+                "%-*s: %s\n"
+                % (
+                    self._colwidth,
+                    name[: self._colwidth],
+                    data[: self._maxwidth - self._colwidth - 2].rstrip(),
+                )
+            )
+
+    def __getattr__(self, attr):
+        if attr[:6] == "start_" or attr[:4] == "end_":
+            method = lambda a=attr, s=self: s._print_name(a)  # noqa: E731
+        else:
+            method = lambda x, a=attr, s=self: s._print_name(a, x)  # noqa: E731
+        return method
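+
+# e.g. (illustrative) feed a report through a scanner with a tagging
+# consumer to print each event as it is emitted:
+#     scanner.feed(uhandle, TaggingConsumer())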
+
+
+def read_and_call(uhandle, method, **keywds):
+    """Read line and pass it to the method.
+
+    Read a line from uhandle, check it, and pass it to the method.
+    Raises a ValueError if the line does not pass the checks.
+
+    start, end, contains, blank, and has_re specify optional conditions
+    that the line must pass.  start and end specify what the line must
+    begin or end with (not counting EOL characters).  contains
+    specifies a substring that must be found in the line.  If blank
+    is a true value, then the line must be blank.  has_re should be
+    a regular expression object with a pattern that the line must match
+    somewhere.
+
+    """
+    line = safe_readline(uhandle)
+    errmsg = _fails_conditions(line, **keywds)
+    if errmsg is not None:
+        raise ValueError(errmsg)
+    method(line)
+
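+# Typical usage mirrors the BLAST scanner above (illustrative):
+#     read_and_call(uhandle, consumer.posted_date, start="    Posted")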
+
+def read_and_call_while(uhandle, method, **keywds):
+    """Read line and pass it to the method while condition is true.
+
+    Read a line from uhandle and pass it to the method as long as
+    some condition is true.  Returns the number of lines that were read.
+
+    See the docstring for read_and_call for a description of the parameters.
+
+    """
+    nlines = 0
+    while True:
+        line = safe_readline(uhandle)
+        # If I've failed the condition, then stop reading the line.
+        if _fails_conditions(line, **keywds):
+            uhandle.saveline(line)
+            break
+        method(line)
+        nlines = nlines + 1
+    return nlines
+
+
+def read_and_call_until(uhandle, method, **keywds):
+    """Read line and pass it to the method until condition is true.
+
+    Read a line from uhandle and pass it to the method until
+    some condition is true.  Returns the number of lines that were read.
+
+    See the docstring for read_and_call for a description of the parameters.
+
+    """
+    nlines = 0
+    while True:
+        line = safe_readline(uhandle)
+        # If I've met the condition, then stop reading the line.
+        if not _fails_conditions(line, **keywds):
+            uhandle.saveline(line)
+            break
+        method(line)
+        nlines = nlines + 1
+    return nlines
+
+
+def attempt_read_and_call(uhandle, method, **keywds):
+    """Attempt read line and call method.
+
+    Similar to read_and_call, but returns a boolean specifying
+    whether the line has passed the checks.  Does not raise an
+    exception if the checks fail (though safe_readline may still
+    raise ValueError at the end of the stream).
+
+    See docs for read_and_call for a description of the function
+    arguments.
+
+    """
+    line = safe_readline(uhandle)
+    passed = not _fails_conditions(line, **keywds)
+    if passed:
+        method(line)
+    else:
+        uhandle.saveline(line)
+    return passed
+
+
+def _fails_conditions(
+    line, start=None, end=None, contains=None, blank=None, has_re=None
+):
+    if start is not None:
+        if line[: len(start)] != start:
+            return "Line does not start with '%s':\n%s" % (start, line)
+    if end is not None:
+        if line.rstrip()[-len(end) :] != end:
+            return "Line does not end with '%s':\n%s" % (end, line)
+    if contains is not None:
+        if contains not in line:
+            return "Line does not contain '%s':\n%s" % (contains, line)
+    if blank is not None:
+        if blank:
+            if not is_blank_line(line):
+                return "Expected blank line, but got:\n%s" % line
+        else:
+            if is_blank_line(line):
+                return "Expected non-blank line, but got a blank one"
+    if has_re is not None:
+        if has_re.search(line) is None:
+            return "Line does not match regex '%s':\n%s" % (has_re.pattern, line)
+    return None
+
+
+def is_blank_line(line, allow_spaces=False):
+    """Check if a line is blank.
+
+    Return whether a line is blank.  allow_spaces specifies whether to
+    allow whitespace in a blank line.  A true value signifies that a
+    line containing whitespace as well as end-of-line characters
+    should be considered blank.
+
+    """
+    if not line:
+        return True
+    if allow_spaces:
+        return line.rstrip() == ""
+    return line[0] == "\n" or line[0] == "\r"
+
+
+def safe_readline(handle):
+    """Read a line, otherwise raises ValueError.
+
+    Read a line from an UndoHandle and return it.  If there are no more
+    lines to read, I will raise a ValueError.
+
+    """
+    line = handle.readline()
+    if not line:
+        raise ValueError("Unexpected end of stream.")
+    return line
+
+
+def safe_peekline(handle):
+    """Peek at the next line if present, otherwise raises ValueError.
+
+    Peek at the next line in an UndoHandle and return it.  If there are no
+    more lines to peek, I will raise a ValueError.
+
+    """
+    line = handle.peekline()
+    if not line:
+        raise ValueError("Unexpected end of stream.")
+    return line
diff --git a/code/lib/Bio/SearchIO/_legacy/__init__.py b/code/lib/Bio/SearchIO/_legacy/__init__.py
new file mode 100644
index 0000000..618df08
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_legacy/__init__.py
@@ -0,0 +1,5 @@
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Legacy functionalities from other parts of Biopython used by SearchIO."""
diff --git a/code/lib/Bio/SearchIO/_legacy/__pycache__/NCBIStandalone.cpython-37.pyc b/code/lib/Bio/SearchIO/_legacy/__pycache__/NCBIStandalone.cpython-37.pyc
new file mode 100644
index 0000000..ada6d15
Binary files /dev/null and b/code/lib/Bio/SearchIO/_legacy/__pycache__/NCBIStandalone.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_legacy/__pycache__/ParserSupport.cpython-37.pyc b/code/lib/Bio/SearchIO/_legacy/__pycache__/ParserSupport.cpython-37.pyc
new file mode 100644
index 0000000..b72fb05
Binary files /dev/null and b/code/lib/Bio/SearchIO/_legacy/__pycache__/ParserSupport.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_legacy/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/_legacy/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..e9759bc
Binary files /dev/null and b/code/lib/Bio/SearchIO/_legacy/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/__init__.py b/code/lib/Bio/SearchIO/_model/__init__.py
new file mode 100644
index 0000000..c81f503
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_model/__init__.py
@@ -0,0 +1,59 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO objects to model similarity search program outputs.
+
+The SearchIO object model consists of a hierarchy of four nested objects:
+
+    * QueryResult, to represent a search query.
+
+      This is the top-level object returned by the main SearchIO ``parse`` and
+      ``read`` functions. QueryResult objects may contain zero or more Hit
+      objects, each accessible by its ID string (like in Python dictionaries)
+      or integer index (like in Python lists).
+
+    * Hit, to represent a database entry containing a full or partial sequence
+      match with the query sequence.
+
+      Hit objects contain one or more HSP objects, each accessible by its integer
+      index. They behave very similarly to a Python list.
+
+    * HSP, to represent a region of significant alignment(s) between the query
+      and hit sequences.
+
+      HSP objects contain one or more HSPFragment objects, each accessible by
+      its integer index. In most cases, the HSP objects are where the bulk of
+      search result statistics (e.g. e-value, bitscore) are stored. Like Hit
+      objects, HSPs also behave very similarly to a Python list.
+
+    * HSPFragment, to represent a single contiguous alignment between the query
+      and hit sequences.
+
+      HSPFragment objects may store hit and query sequences resulting from the
+      sequence search. If present, these sequences are stored as SeqRecord
+      objects (see SeqRecord). If both of them are present, HSPFragment will
+      create a MultipleSeqAlignment object from both sequences.
+
+      Most search programs only have HSPs with one HSPFragment in them, making
+      these two objects inseparable. However, there are programs (e.g. BLAT and
+      Exonerate) which may have more than one HSPFragment object in any given
+      HSP. If you are not using these programs, you can safely consider HSP and
+      HSPFragment as a single union.
+
+"""
+
+from .query import QueryResult
+from .hit import Hit
+from .hsp import HSP, HSPFragment
+
+
+__all__ = ("QueryResult", "Hit", "HSP", "HSPFragment")
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/_model/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SearchIO/_model/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..6efc357
Binary files /dev/null and b/code/lib/Bio/SearchIO/_model/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/__pycache__/_base.cpython-37.pyc b/code/lib/Bio/SearchIO/_model/__pycache__/_base.cpython-37.pyc
new file mode 100644
index 0000000..fb5f3e0
Binary files /dev/null and b/code/lib/Bio/SearchIO/_model/__pycache__/_base.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/__pycache__/hit.cpython-37.pyc b/code/lib/Bio/SearchIO/_model/__pycache__/hit.cpython-37.pyc
new file mode 100644
index 0000000..898a65a
Binary files /dev/null and b/code/lib/Bio/SearchIO/_model/__pycache__/hit.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/__pycache__/hsp.cpython-37.pyc b/code/lib/Bio/SearchIO/_model/__pycache__/hsp.cpython-37.pyc
new file mode 100644
index 0000000..e11c670
Binary files /dev/null and b/code/lib/Bio/SearchIO/_model/__pycache__/hsp.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/__pycache__/query.cpython-37.pyc b/code/lib/Bio/SearchIO/_model/__pycache__/query.cpython-37.pyc
new file mode 100644
index 0000000..d74679d
Binary files /dev/null and b/code/lib/Bio/SearchIO/_model/__pycache__/query.cpython-37.pyc differ
diff --git a/code/lib/Bio/SearchIO/_model/_base.py b/code/lib/Bio/SearchIO/_model/_base.py
new file mode 100644
index 0000000..001755a
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_model/_base.py
@@ -0,0 +1,68 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Abstract base classes for the SearchIO object model."""
+
+
+from Bio.SearchIO._utils import getattr_str
+
+
+class _BaseSearchObject:
+    """Abstract class for SearchIO objects."""
+
+    _NON_STICKY_ATTRS = ()
+
+    def _transfer_attrs(self, obj):
+        """Transfer instance attributes to the given object (PRIVATE).
+
+        This method is used to transfer attributes set externally (for example
+        using ``setattr``) to a new object created from this one (for example
+        from slicing).
+
+        The reason this method is necessary is because different parsers will
+        set different attributes for each QueryResult, Hit, HSP, or HSPFragment
+        object, depending on the attributes they found in the search output
+        file. Ideally, we want these attributes to 'stick' with any new instance
+        object created from the original one.
+
+        """
+        # transfer all attributes except those listed in _NON_STICKY_ATTRS
+        for attr in self.__dict__:
+            if attr not in self._NON_STICKY_ATTRS:
+                setattr(obj, attr, self.__dict__[attr])
+
+
+class _BaseHSP(_BaseSearchObject):
+    """Abstract base class for HSP objects."""
+
+    def _str_hsp_header(self):
+        """Print the alignment header info (PRIVATE)."""
+        lines = []
+        # set query id line
+        qid_line = "      Query: %s %s" % (self.query_id, self.query_description)
+        qid_line = qid_line[:77] + "..." if len(qid_line) > 80 else qid_line
+        # set hit id line
+        hid_line = "        Hit: %s %s" % (self.hit_id, self.hit_description)
+        hid_line = hid_line[:77] + "..." if len(hid_line) > 80 else hid_line
+        lines.append(qid_line)
+        lines.append(hid_line)
+
+        # coordinates
+        query_start = getattr_str(self, "query_start")
+        query_end = getattr_str(self, "query_end")
+        hit_start = getattr_str(self, "hit_start")
+        hit_end = getattr_str(self, "hit_end")
+
+        # strands
+        try:
+            qstrand = self.query_strand
+            hstrand = self.hit_strand
+        except ValueError:
+            qstrand = self.query_strand_all[0]
+            hstrand = self.hit_strand_all[0]
+        lines.append("Query range: [%s:%s] (%r)" % (query_start, query_end, qstrand))
+        lines.append("  Hit range: [%s:%s] (%r)" % (hit_start, hit_end, hstrand))
+
+        return "\n".join(lines)
diff --git a/code/lib/Bio/SearchIO/_model/hit.py b/code/lib/Bio/SearchIO/_model/hit.py
new file mode 100644
index 0000000..bd027db
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_model/hit.py
@@ -0,0 +1,463 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO object to model a single database hit."""
+
+
+from itertools import chain
+
+from Bio.SearchIO._utils import allitems, optionalcascade, getattr_str
+
+from ._base import _BaseSearchObject
+from .hsp import HSP
+
+
+class Hit(_BaseSearchObject):
+    """Class representing a single database hit of a search result.
+
+    Hit objects are the second-level container in the SearchIO module. They
+    are the objects contained within a QueryResult (see QueryResult). They
+    themselves are containers for HSP objects and will contain at least one
+    HSP.
+
+    To have a quick look at a Hit and its contents, invoke ``print`` on it::
+
+        >>> from Bio import SearchIO
+        >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+        >>> hit = qresult[3]
+        >>> print(hit)
+        Query: 33211
+               mir_1
+          Hit: gi|301171322|ref|NR_035857.1| (86)
+               Pan troglodytes microRNA mir-520c (MIR520C), microRNA
+         HSPs: ----  --------  ---------  ------  ---------------  ---------------------
+                  #   E-value  Bit score    Span      Query range              Hit range
+               ----  --------  ---------  ------  ---------------  ---------------------
+                  0   8.9e-20     100.47      60           [1:61]                [13:73]
+                  1   3.3e-06      55.39      60           [0:60]                [13:73]
+
+    You can invoke ``len`` on a Hit object to see how many HSP objects it contains::
+
+        >>> len(hit)
+        2
+
+    Hit objects behave very similarly to Python lists. You can retrieve the HSP
+    object inside a Hit using the HSP's integer index. Hit objects can also be
+    sliced, which will return a new Hit object containing only the sliced HSPs::
+
+        # HSP items inside the Hit can be retrieved using its integer index
+        >>> hit[0]
+        HSP(hit_id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 fragments)
+
+        # slicing returns a new Hit
+        >>> hit
+        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
+        >>> hit[:1]
+        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 hsps)
+        >>> print(hit[1:])
+        Query: 33211
+               mir_1
+          Hit: gi|301171322|ref|NR_035857.1| (86)
+               Pan troglodytes microRNA mir-520c (MIR520C), microRNA
+         HSPs: ----  --------  ---------  ------  ---------------  ---------------------
+                  #   E-value  Bit score    Span      Query range              Hit range
+               ----  --------  ---------  ------  ---------------  ---------------------
+                  0   3.3e-06      55.39      60           [0:60]                [13:73]
+
+    Hit objects provide ``filter`` and ``map`` methods, which are analogous to
+    Python's built-in ``filter`` and ``map`` except that they return a new Hit
+    object instead of a list.
+
+    Here is an example of using ``filter`` to select for HSPs whose e-value is
+    less than 1e-10::
+
+        >>> evalue_filter = lambda hsp: hsp.evalue < 1e-10
+        >>> filtered_hit = hit.filter(evalue_filter)
+        >>> len(hit)
+        2
+        >>> len(filtered_hit)
+        1
+        >>> print(filtered_hit)
+        Query: 33211
+               mir_1
+          Hit: gi|301171322|ref|NR_035857.1| (86)
+               Pan troglodytes microRNA mir-520c (MIR520C), microRNA
+         HSPs: ----  --------  ---------  ------  ---------------  ---------------------
+                  #   E-value  Bit score    Span      Query range              Hit range
+               ----  --------  ---------  ------  ---------------  ---------------------
+                  0   8.9e-20     100.47      60           [1:61]                [13:73]
+
+    There are also other methods which are counterparts of Python lists' methods
+    with the same names: ``append``, ``index``, ``pop``, and ``sort``. Consult their
+    respective documentation for more details and examples of their usage.
+
+    """
+
+    # attributes we don't want to transfer when creating a new Hit object
+    # from this one
+    _NON_STICKY_ATTRS = ("_items",)
+
+    def __init__(self, hsps=(), id=None, query_id=None):
+        """Initialize a Hit object.
+
+        :param hsps: HSP objects contained in the Hit object
+        :type hsps: iterable yielding HSP
+        :param id: hit ID
+        :type id: string
+        :param query_id: query ID
+        :type query_id: string
+
+        If multiple HSP objects are used for initialization, they must all
+        have the same ``query_id``, ``query_description``, ``hit_id``, and
+        ``hit_description`` properties.
+        """
+        # default attribute values
+        self._id = id
+        self._id_alt = []
+        self._query_id = query_id
+        self._description = None
+        self._description_alt = []
+        self._query_description = None
+        self.attributes = {}
+        self.dbxrefs = []
+
+        # TODO - Move this into the for loop below in case
+        # hsps is a single use iterator?
+        for attr in ("query_id", "query_description", "hit_id", "hit_description"):
+            # HACK: setting the if clause to '> 1' allows for empty hit objects.
+            # This makes it easier to work with file formats with unpredictable
+            # hit-hsp ordering. The empty hit object itself is nonfunctional,
+            # however, since all its cascading properties are empty.
+            if len({getattr(hsp, attr) for hsp in hsps}) > 1:
+                raise ValueError(
+                    "Hit object can not contain HSPs with more than one %s." % attr
+                )
+
+        self._items = []
+        for hsp in hsps:
+            # validate each HSP
+            self._validate_hsp(hsp)
+            # and store it in this Hit
+            self.append(hsp)
+
+    def __repr__(self):
+        """Return string representation of Hit object."""
+        return "Hit(id=%r, query_id=%r, %r hsps)" % (self.id, self.query_id, len(self))
+
+    def __iter__(self):
+        """Iterate over hsps."""
+        return iter(self.hsps)
+
+    def __len__(self):
+        """Return number of hsps."""
+        return len(self.hsps)
+
+    def __bool__(self):
+        """Return True if there are hsps."""
+        return bool(self.hsps)
+
+    def __contains__(self, hsp):
+        """Return True if hsp in items."""
+        return hsp in self._items
+
+    def __str__(self):
+        """Return a human readable summary of the Hit object."""
+        lines = []
+
+        # set query id line
+        qid_line = "Query: %s" % self.query_id
+        lines.append(qid_line)
+        if self.query_description:
+            line = "       %s" % self.query_description
+            line = line[:77] + "..." if len(line) > 80 else line
+            lines.append(line)
+
+        # set hit id line
+        hid_line = "  Hit: %s" % self.id
+        try:
+            seq_len = self.seq_len
+        except AttributeError:
+            pass
+        else:
+            hid_line += " (%i)" % seq_len
+        lines.append(hid_line)
+        if self.description:
+            line = "       %s" % self.description
+            line = line[:77] + "..." if len(line) > 80 else line
+            lines.append(line)
+
+        # set attributes lines
+        for key, value in sorted(self.attributes.items()):
+            lines.append(" %s: %s" % (key, value))
+
+        # set dbxrefs line
+        if self.dbxrefs:
+            lines.append("Database cross-references: " + ", ".join(self.dbxrefs))
+
+        # set hsp line and table
+        if not self.hsps:
+            lines.append(" HSPs: ?")
+        else:
+            lines.append(
+                " HSPs: %s  %s  %s  %s  %s  %s"
+                % ("-" * 4, "-" * 8, "-" * 9, "-" * 6, "-" * 15, "-" * 21)
+            )
+            pattern = "%11s  %8s  %9s  %6s  %15s  %21s"
+            lines.append(
+                pattern
+                % ("#", "E-value", "Bit score", "Span", "Query range", "Hit range")
+            )
+            lines.append(
+                pattern % ("-" * 4, "-" * 8, "-" * 9, "-" * 6, "-" * 15, "-" * 21)
+            )
+            for idx, hsp in enumerate(self.hsps):
+                # evalue
+                evalue = getattr_str(hsp, "evalue", fmt="%.2g")
+                # bitscore
+                bitscore = getattr_str(hsp, "bitscore", fmt="%.2f")
+                # alignment length
+                aln_span = getattr_str(hsp, "aln_span")
+                # query region
+                query_start = getattr_str(hsp, "query_start")
+                query_end = getattr_str(hsp, "query_end")
+                query_range = "[%s:%s]" % (query_start, query_end)
+                # max column length is 15
+                query_range = (
+                    query_range[:13] + "~]" if len(query_range) > 15 else query_range
+                )
+                # hit region
+                hit_start = getattr_str(hsp, "hit_start")
+                hit_end = getattr_str(hsp, "hit_end")
+                hit_range = "[%s:%s]" % (hit_start, hit_end)
+                hit_range = hit_range[:19] + "~]" if len(hit_range) > 21 else hit_range
+                # append the hsp row
+                lines.append(
+                    pattern % (idx, evalue, bitscore, aln_span, query_range, hit_range)
+                )
+
+        return "\n".join(lines)
+
+    def __getitem__(self, idx):
+        """Return the HSP object at the given index."""
+        # if key is slice, return a new Hit instance
+        if isinstance(idx, slice):
+            obj = self.__class__(self.hsps[idx])
+            self._transfer_attrs(obj)
+            return obj
+        return self._items[idx]
+
+    def __setitem__(self, idx, hsps):
+        """Assign hsps to index idx."""
+        # handle case if hsps is a list of hsp
+        if isinstance(hsps, (list, tuple)):
+            for hsp in hsps:
+                self._validate_hsp(hsp)
+        else:
+            self._validate_hsp(hsps)
+
+        self._items[idx] = hsps
+
+    def __delitem__(self, idx):
+        """Delete item of index idx."""
+        del self._items[idx]
+
+    # hsp properties #
+    def _validate_hsp(self, hsp):
+        """Validate an HSP object (PRIVATE).
+
+        Valid HSP objects have the same hit_id as the Hit object ID and the
+        same query_id as the Hit object's query_id.
+
+        """
+        if not isinstance(hsp, HSP):
+            raise TypeError("Hit objects can only contain HSP objects.")
+        # HACK: to make validation during __init__ work
+        if self._items:
+            if self.id is not None:
+                if hsp.hit_id != self.id:
+                    raise ValueError(
+                        "Expected HSP with hit ID %r, found %r instead."
+                        % (self.id, hsp.hit_id)
+                    )
+            else:
+                self.id = hsp.hit_id
+
+            if self.description is not None:
+                if hsp.hit_description != self.description:
+                    raise ValueError(
+                        "Expected HSP with hit description %r, found %r instead."
+                        % (self.description, hsp.hit_description)
+                    )
+            else:
+                self.description = hsp.hit_description
+
+            if self.query_id is not None:
+                if hsp.query_id != self.query_id:
+                    raise ValueError(
+                        "Expected HSP with query ID %r, found %r instead."
+                        % (self.query_id, hsp.query_id)
+                    )
+            else:
+                self.query_id = hsp.query_id
+
+            if self.query_description is not None:
+                if hsp.query_description != self.query_description:
+                    raise ValueError(
+                        "Expected HSP with query description %r, found %r instead."
+                        % (self.query_description, hsp.query_description)
+                    )
+            else:
+                self.query_description = hsp.query_description
+
+    # properties #
+    description = optionalcascade(
+        "_description", "hit_description", """Hit description"""
+    )
+    query_description = optionalcascade(
+        "_query_description",
+        "query_description",
+        """Description of the query that produced the hit""",
+    )
+    id = optionalcascade("_id", "hit_id", """Hit ID string.""")
+    query_id = optionalcascade(
+        "_query_id", "query_id", """ID string of the query that produced the hit"""
+    )
+    # returns all hsps
+    hsps = allitems(doc="""HSP objects contained in the Hit""")
+
+    @property
+    def id_all(self):
+        """Alternative ID(s) of the Hit."""
+        return [self.id] + self._id_alt
+
+    @property
+    def description_all(self):
+        """Alternative descriptions of the Hit."""
+        return [self.description] + self._description_alt
+
+    @property
+    def fragments(self):
+        """Access the HSPFragment objects contained in the Hit."""
+        return list(chain(*self._items))
+
+    # public methods #
+    def append(self, hsp):
+        """Add a HSP object to the end of Hit.
+
+        Parameters
+        hsp -- HSP object to append.
+
+        Any HSP object appended must have the same ``hit_id`` property as the
+        Hit object's ``id`` property and the same ``query_id`` property as the
+        Hit object's ``query_id`` property.
+
+        """
+        self._validate_hsp(hsp)
+        self._items.append(hsp)
+
+    def filter(self, func=None):
+        """Create new Hit object whose HSP objects pass the filter function.
+
+        :param func: function for filtering
+        :type func: callable, accepts HSP, returns bool
+
+        ``filter`` is analogous to Python's built-in ``filter`` function, except
+        that instead of returning a list it returns a ``Hit`` object. Here is an
+        example of using ``filter`` to select for HSPs having bitscores bigger
+        than 60::
+
+            >>> from Bio import SearchIO
+            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+            >>> hit = qresult[3]
+            >>> bitscore_filter = lambda hsp: hsp.bitscore > 60
+            >>> filtered_hit = hit.filter(bitscore_filter)
+            >>> len(hit)
+            2
+            >>> len(filtered_hit)
+            1
+            >>> print(filtered_hit)
+            Query: 33211
+                   mir_1
+              Hit: gi|301171322|ref|NR_035857.1| (86)
+                   Pan troglodytes microRNA mir-520c (MIR520C), microRNA
+             HSPs: ----  --------  ---------  ------  ---------------  ---------------------
+                      #   E-value  Bit score    Span      Query range              Hit range
+                   ----  --------  ---------  ------  ---------------  ---------------------
+                      0   8.9e-20     100.47      60           [1:61]                [13:73]
+
+        """
+        hsps = list(filter(func, self.hsps))
+        if hsps:
+            obj = self.__class__(hsps)
+            self._transfer_attrs(obj)
+            return obj
+
+    def index(self, hsp):
+        """Return the index of a given HSP object, zero-based.
+
+        :param hsp: object to look up
+        :type hsp: HSP
+
+        """
+        return self._items.index(hsp)
+
+    def map(self, func=None):
+        """Create new Hit object, mapping the given function to its HSPs.
+
+        :param func: function for mapping
+        :type func: callable, accepts HSP, returns HSP
+
+        ``map`` is analogous to Python's built-in ``map`` function: the given
+        function is applied to all HSPs contained in the Hit object, and a new
+        Hit object is returned.
+
+        """
+        if func is not None:
+            hsps = [func(x) for x in self.hsps]  # builds a new list
+        else:
+            hsps = self.hsps[:]
+        if hsps:
+            obj = self.__class__(hsps)
+            self._transfer_attrs(obj)
+            return obj
+
+    def pop(self, index=-1):
+        """Remove and returns the HSP object at the specified index.
+
+        :param index: index of HSP object to pop
+        :type index: int
+
+        """
+        return self._items.pop(index)
+
+    def sort(self, key=None, reverse=False, in_place=True):
+        """Sort the HSP objects.
+
+        :param key: sorting function
+        :type key: callable, accepts HSP, returns key for sorting
+        :param reverse: whether to reverse the sorting results or not
+        :type reverse: bool
+        :param in_place: whether to sort in place or not
+        :type in_place: bool
+
+        ``sort`` defaults to sorting in-place, to mimic Python's ``list.sort``
+        method. If you set the ``in_place`` argument to False, it will instead
+        return a new, sorted Hit object and keep the initial one unsorted.
+
+        """
+        if in_place:
+            self._items.sort(key=key, reverse=reverse)
+        else:
+            hsps = self.hsps[:]
+            hsps.sort(key=key, reverse=reverse)
+            obj = self.__class__(hsps)
+            self._transfer_attrs(obj)
+            return obj
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/_model/hsp.py b/code/lib/Bio/SearchIO/_model/hsp.py
new file mode 100644
index 0000000..ce15ee2
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_model/hsp.py
@@ -0,0 +1,1230 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO objects to model high scoring regions between query and hit."""
+
+import warnings
+from operator import ge, le
+
+from Bio import BiopythonWarning
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from Bio.SearchIO._utils import (
+    singleitem,
+    allitems,
+    fullcascade,
+    fragcascade,
+    getattr_str,
+)
+
+from ._base import _BaseHSP
+
+
+class HSP(_BaseHSP):
+    """Class representing high-scoring region(s) between query and hit.
+
+    HSP (high-scoring pair) objects are contained by Hit objects (see Hit).
+    In most cases, HSP objects store the bulk of the statistics and results
+    (e.g. e-value, bitscores, query sequence, etc.) produced by a search
+    program.
+
+    Depending on the search output file format, a given HSP will contain one
+    or more HSPFragment object(s). Examples of search programs that produce HSP
+    with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as
+    BLAT or Exonerate may produce HSPs containing more than one HSPFragment.
+    However, their native terminologies may differ: in BLAT these fragments
+    are called 'blocks' while in Exonerate they are called exons or NER.
+
+    Here are examples from each type of HSP. The first one comes from a BLAST
+    search::
+
+        >>> from Bio import SearchIO
+        >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+        >>> blast_hsp = blast_qresult[1][0]     # the first HSP from the second hit
+        >>> blast_hsp
+        HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments)
+        >>> print(blast_hsp)
+              Query: 33211 mir_1
+                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
+        Query range: [1:61] (1)
+          Hit range: [0:60] (1)
+        Quick stats: evalue 1.7e-22; bitscore 109.49
+          Fragments: 1 (60 columns)
+             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+
+    For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the
+    underlying sequence alignment, if it exists. This is not the case for HSPs
+    with more than one HSPFragment. Below is an example, using an HSP from a
+    BLAT search. Invoking ``print`` on these HSPs will instead show a table of the
+    HSPFragment objects it contains::
+
+        >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True)
+        >>> blat_hsp = blat_qresult[1][0]       # the first HSP from the second hit
+        >>> blat_hsp
+        HSP(hit_id='chr11', query_id='blat_1', 2 fragments)
+        >>> print(blat_hsp)
+              Query: blat_1 
+                Hit: chr11 
+        Query range: [42:67] (-1)
+          Hit range: [59018929:59018955] (1)
+        Quick stats: evalue ?; bitscore ?
+          Fragments: ---  --------------  ----------------------  ----------------------
+                       #            Span             Query range               Hit range
+                     ---  --------------  ----------------------  ----------------------
+                       0               6                 [61:67]     [59018929:59018935]
+                       1              16                 [42:58]     [59018939:59018955]
+
+    Notice that in HSPs with more than one HSPFragment, the HSP's ``query_range``
+    and ``hit_range`` properties encompass all fragments they contain.
+
+    You can check whether an HSP has more than one HSPFragment or not using the
+    ``is_fragmented`` property::
+
+        >>> blast_hsp.is_fragmented
+        False
+        >>> blat_hsp.is_fragmented
+        True
+
+    Since HSP objects are also containers similar to Python lists, you can
+    access a single fragment in an HSP using its integer index::
+
+        >>> blat_fragment = blat_hsp[0]
+        >>> print(blat_fragment)
+              Query: blat_1 
+                Hit: chr11 
+        Query range: [61:67] (-1)
+          Hit range: [59018929:59018935] (1)
+          Fragments: 1 (6 columns)
+             Query - tatagt
+               Hit - tatagt
+
+    This applies to HSP objects with a single fragment as well::
+
+        >>> blast_fragment = blast_hsp[0]
+        >>> print(blast_fragment)
+              Query: 33211 mir_1
+                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
+        Query range: [1:61] (1)
+          Hit range: [0:60] (1)
+          Fragments: 1 (60 columns)
+             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+
+    Regardless of the search output file format, HSP objects provide the
+    properties listed below. These properties always return values in a list,
+    due to the HSP object itself being a list-like container. However, for
+    HSP objects with a single HSPFragment, shortcut properties that fetch
+    the item from the list are also provided.
+
+    +----------------------+---------------------+-----------------------------+
+    | Property             | Shortcut            | Value                       |
+    +======================+=====================+=============================+
+    | aln_all              | aln                 | HSP alignments as           |
+    |                      |                     | MultipleSeqAlignment object |
+    +----------------------+---------------------+-----------------------------+
+    | aln_annotation_all   | aln_annotation      | dictionary of annotation(s) |
+    |                      |                     | of all fragments' alignments|
+    +----------------------+---------------------+-----------------------------+
+    | fragments            | fragment            | HSPFragment objects         |
+    +----------------------+---------------------+-----------------------------+
+    | hit_all              | hit                 | hit sequence as SeqRecord   |
+    |                      |                     | objects                     |
+    +----------------------+---------------------+-----------------------------+
+    | hit_features_all     | hit_features        | SeqFeatures of all hit      |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | hit_start_all        | hit_start*          | start coordinates of the    |
+    |                      |                     | hit fragments               |
+    +----------------------+---------------------+-----------------------------+
+    | hit_end_all          | hit_end*            | end coordinates of the hit  |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | hit_span_all         | hit_span*           | sizes of each hit fragments |
+    +----------------------+---------------------+-----------------------------+
+    | hit_strand_all       | hit_strand          | strand orientations of the  |
+    |                      |                     | hit fragments               |
+    +----------------------+---------------------+-----------------------------+
+    | hit_frame_all        | hit_frame           | reading frames of the hit   |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | hit_range_all        | hit_range           | tuples of start and end     |
+    |                      |                     | coordinates of each hit     |
+    |                      |                     | fragment                    |
+    +----------------------+---------------------+-----------------------------+
+    | query_all            | query               | query sequence as SeqRecord |
+    |                      |                     | object                      |
+    +----------------------+---------------------+-----------------------------+
+    | query_features_all   | query_features      | SeqFeatures of all query    |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | query_start_all      | query_start*        | start coordinates of the    |
+    |                      |                     | query fragments             |
+    +----------------------+---------------------+-----------------------------+
+    | query_end_all        | query_end*          | end coordinates of the      |
+    |                      |                     | query fragments             |
+    +----------------------+---------------------+-----------------------------+
+    | query_span_all       | query_span*         | sizes of each query         |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | query_strand_all     | query_strand        | strand orientations of the  |
+    |                      |                     | query fragments             |
+    +----------------------+---------------------+-----------------------------+
+    | query_frame_all      | query_frame         | reading frames of the query |
+    |                      |                     | fragments                   |
+    +----------------------+---------------------+-----------------------------+
+    | query_range_all      | query_range         | tuples of start and end     |
+    |                      |                     | coordinates of each query   |
+    |                      |                     | fragment                    |
+    +----------------------+---------------------+-----------------------------+
+
+    For all types of HSP objects, the property will return the values in a list.
+    Shortcuts are only applicable for HSPs with one fragment. Except for the
+    ones noted, if they are used on an HSP with more than one fragment, an exception
+    will be raised.
+
+    For properties that may be used in HSPs with multiple or single fragments
+    (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends
+    on how many fragments the HSP has:
+
+    +------------+---------------------------------------------------+
+    | Property   | Value                                             |
+    +============+===================================================+
+    | hit_start  | smallest coordinate value of all hit fragments    |
+    +------------+---------------------------------------------------+
+    | hit_end    | largest coordinate value of all hit fragments     |
+    +------------+---------------------------------------------------+
+    | hit_span   | difference between ``hit_start`` and ``hit_end``  |
+    +------------+---------------------------------------------------+
+    | query_start| smallest coordinate value of all query fragments  |
+    +------------+---------------------------------------------------+
+    | query_end  | largest coordinate value of all query fragments   |
+    +------------+---------------------------------------------------+
+    | query_span | difference between ``query_start`` and            |
+    |            | ``query_end``                                     |
+    +------------+---------------------------------------------------+
+
+    In addition to the objects listed above, HSP objects also provide the
+    following properties and/or attributes:
+
+    +--------------------+------------------------------------------------------+
+    | Property           | Value                                                |
+    +====================+======================================================+
+    | aln_span           | total number of residues in all HSPFragment objects  |
+    +--------------------+------------------------------------------------------+
+    | molecule_type      | molecule_type of the hit and query SeqRecord objects |
+    +--------------------+------------------------------------------------------+
+    | is_fragmented      | boolean, whether there are multiple fragments or not |
+    +--------------------+------------------------------------------------------+
+    | hit_id             | ID of the hit sequence                               |
+    +--------------------+------------------------------------------------------+
+    | hit_description    | description of the hit sequence                      |
+    +--------------------+------------------------------------------------------+
+    | hit_inter_ranges   | list of hit sequence coordinates of the regions      |
+    |                    | between fragments                                    |
+    +--------------------+------------------------------------------------------+
+    | hit_inter_spans    | list of lengths of the regions between hit fragments |
+    +--------------------+------------------------------------------------------+
+    | output_index       | 0-based index for storing the order by which the HSP |
+    |                    | appears in the output file (default: -1).            |
+    +--------------------+------------------------------------------------------+
+    | query_id           | ID of the query sequence                             |
+    +--------------------+------------------------------------------------------+
+    | query_description  | description of the query sequence                    |
+    +--------------------+------------------------------------------------------+
+    | query_inter_ranges | list of query sequence coordinates of the regions    |
+    |                    | between fragments                                    |
+    +--------------------+------------------------------------------------------+
+    | query_inter_spans  | list of lengths of the regions between query         |
+    |                    | fragments                                            |
+    +--------------------+------------------------------------------------------+
+
+    Properties marked with an asterisk (*) may also be used in HSPs with
+    multiple fragments.
+
+    """
+
+    # attributes we don't want to transfer when creating a new HSP object
+    # from this one
+    _NON_STICKY_ATTRS = ("_items",)
+
+    def __init__(self, fragments=(), output_index=-1):
+        """Initialize an HSP object.
+
+        :param fragments: fragments contained in the HSP object
+        :type fragments: iterable yielding HSPFragment
+        :param output_index: optional index / ordering of the HSP fragment in
+            the original input file.
+        :type output_index: integer
+
+        HSP objects must be initialized with a list containing at least one
+        HSPFragment object. If multiple HSPFragment objects are used for
+        initialization, they must all have the same ``query_id``,
+        ``query_description``, ``hit_id``, ``hit_description``, and
+        ``molecule_type`` properties.
+
+        """
+        if not fragments:
+            raise ValueError("HSP objects must have at least one HSPFragment object.")
+        # TODO - Move this into the for loop in case fragments is a single use
+        # iterable?
+        # check that all fragments contain the same IDs, descriptions,
+        # molecule_type
+        for attr in (
+            "query_id",
+            "query_description",
+            "hit_id",
+            "hit_description",
+            "molecule_type",
+        ):
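+            # collapsing each attribute's values into a set detects any
+            # disagreement between fragments in a single pass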
+            if len({getattr(frag, attr) for frag in fragments}) != 1:
+                raise ValueError(
+                    "HSP object can not contain fragments with more than one %s." % attr
+                )
+
+        self.output_index = output_index
+        self._items = []
+        for fragment in fragments:
+            self._validate_fragment(fragment)
+            self._items.append(fragment)
+
+    def __repr__(self):
+        """Return string representation of HSP object."""
+        return "%s(hit_id=%r, query_id=%r, %r fragments)" % (
+            self.__class__.__name__,
+            self.hit_id,
+            self.query_id,
+            len(self),
+        )
+
+    def __iter__(self):
+        """Iterate over HSP items."""
+        return iter(self._items)
+
+    def __contains__(self, fragment):
+        """Return True if HSPFragment is on HSP items."""
+        return fragment in self._items
+
+    def __len__(self):
+        """Return number of HSPs items."""
+        return len(self._items)
+
+    def __bool__(self):
+        """Return True if it has HSPs."""
+        return bool(self._items)
+
+    def __str__(self):
+        """Return a human readable summary of the HSP object."""
+        lines = []
+        # set hsp info line
+        statline = []
+        # evalue
+        evalue = getattr_str(self, "evalue", fmt="%.2g")
+        statline.append("evalue " + evalue)
+        # bitscore
+        bitscore = getattr_str(self, "bitscore", fmt="%.2f")
+        statline.append("bitscore " + bitscore)
+        lines.append("Quick stats: " + "; ".join(statline))
+
+        if len(self.fragments) == 1:
+            return "\n".join(
+                [self._str_hsp_header(), "\n".join(lines), self.fragments[0]._str_aln()]
+            )
+        else:
+            lines.append(
+                "  Fragments: %s  %s  %s  %s" % ("-" * 3, "-" * 14, "-" * 22, "-" * 22)
+            )
+            pattern = "%16s  %14s  %22s  %22s"
+            lines.append(pattern % ("#", "Span", "Query range", "Hit range"))
+            lines.append(pattern % ("-" * 3, "-" * 14, "-" * 22, "-" * 22))
+            for idx, block in enumerate(self.fragments):
+                # set hsp line and table
+                # alignment span
+                aln_span = getattr_str(block, "aln_span")
+                # query region
+                query_start = getattr_str(block, "query_start")
+                query_end = getattr_str(block, "query_end")
+                query_range = "[%s:%s]" % (query_start, query_end)
+                # max column length is 22
+                query_range = (
+                    query_range[:20] + "~]" if len(query_range) > 22 else query_range
+                )
+                # hit region
+                hit_start = getattr_str(block, "hit_start")
+                hit_end = getattr_str(block, "hit_end")
+                hit_range = "[%s:%s]" % (hit_start, hit_end)
+                hit_range = hit_range[:20] + "~]" if len(hit_range) > 22 else hit_range
+                # append the hsp row
+                lines.append(pattern % (str(idx), aln_span, query_range, hit_range))
+
+            return self._str_hsp_header() + "\n" + "\n".join(lines)
+
+    def __getitem__(self, idx):
+        """Return object of index idx."""
+        # if key is slice, return a new HSP instance
+        if isinstance(idx, slice):
+            obj = self.__class__(self._items[idx])
+            self._transfer_attrs(obj)
+            return obj
+        return self._items[idx]
+
+    def __setitem__(self, idx, fragments):
+        """Set an item of index idx with the given fragments."""
+        # handle case if fragments is a list of HSPFragment objects
+        if isinstance(fragments, (list, tuple)):
+            for fragment in fragments:
+                self._validate_fragment(fragment)
+        else:
+            self._validate_fragment(fragments)
+
+        self._items[idx] = fragments
+
+    def __delitem__(self, idx):
+        """Delete item of index idx."""
+        # note that this may result in an empty HSP object, which should be
+        # invalid
+        del self._items[idx]
+
+    def _validate_fragment(self, fragment):
+        if not isinstance(fragment, HSPFragment):
+            raise TypeError("HSP objects can only contain HSPFragment objects.")
+        # HACK: to make validation during __init__ work
+        if self._items:
+            if fragment.hit_id != self.hit_id:
+                raise ValueError(
+                    "Expected HSPFragment with hit ID %r, found %r instead."
+                    % (self.hit_id, fragment.hit_id)
+                )
+
+            if fragment.hit_description != self.hit_description:
+                raise ValueError(
+                    "Expected HSPFragment with hit description %r, found %r instead."
+                    % (self.hit_description, fragment.hit_description)
+                )
+
+            if fragment.query_id != self.query_id:
+                raise ValueError(
+                    "Expected HSPFragment with query ID %r, found %r instead."
+                    % (self.query_id, fragment.query_id)
+                )
+
+            if fragment.query_description != self.query_description:
+                raise ValueError(
+                    "Expected HSP with query description %r, found %r instead."
+                    % (self.query_description, fragment.query_description)
+                )
+
+    def _aln_span_get(self):
+        # length of all alignments
+        # alignment span can be its own attribute, or computed from
+        # query / hit length
+        return sum(frg.aln_span for frg in self.fragments)
+
+    aln_span = property(
+        fget=_aln_span_get, doc="Total number of columns in all HSPFragment objects."
+    )
+
+    # coordinate properties #
+    def _get_coords(self, seq_type, coord_type):
+        assert seq_type in ("hit", "query")
+        assert coord_type in ("start", "end")
+        coord_name = "%s_%s" % (seq_type, coord_type)
+        coords = [getattr(frag, coord_name) for frag in self.fragments]
+        if None in coords:
+            warnings.warn(
+                "'None' exist in %s coordinates; ignored" % (coord_name),
+                BiopythonWarning,
+            )
+        return coords
+
+    def _hit_start_get(self):
+        return min(self._get_coords("hit", "start"))
+
+    hit_start = property(
+        fget=_hit_start_get, doc="Smallest coordinate value of all hit fragments."
+    )
+
+    def _query_start_get(self):
+        return min(self._get_coords("query", "start"))
+
+    query_start = property(
+        fget=_query_start_get, doc="Smallest coordinate value of all query fragments."
+    )
+
+    def _hit_end_get(self):
+        return max(self._get_coords("hit", "end"))
+
+    hit_end = property(
+        fget=_hit_end_get, doc="Largest coordinate value of all hit fragments."
+    )
+
+    def _query_end_get(self):
+        return max(self._get_coords("query", "end"))
+
+    query_end = property(
+        fget=_query_end_get, doc="Largest coordinate value of all query fragments."
+    )
+
+    # coordinate-dependent properties #
+    def _hit_span_get(self):
+        try:
+            return self.hit_end - self.hit_start
+        except TypeError:  # triggered if any of the coordinates are None
+            return None
+
+    hit_span = property(
+        fget=_hit_span_get, doc="The number of hit residues covered by the HSP."
+    )
+
+    def _query_span_get(self):
+        try:
+            return self.query_end - self.query_start
+        except TypeError:  # triggered if any of the coordinates are None
+            return None
+
+    query_span = property(
+        fget=_query_span_get, doc="The number of query residues covered by the HSP."
+    )
+
+    def _hit_range_get(self):
+        return (self.hit_start, self.hit_end)
+
+    hit_range = property(
+        fget=_hit_range_get, doc="Tuple of HSP hit start and end coordinates."
+    )
+
+    def _query_range_get(self):
+        return (self.query_start, self.query_end)
+
+    query_range = property(
+        fget=_query_range_get, doc="Tuple of HSP query start and end coordinates."
+    )
+
+    def _inter_ranges_get(self, seq_type):
+        # this property assumes that there are no mixed strands in a hit/query
+        assert seq_type in ("query", "hit")
+        strand = getattr(self, "%s_strand_all" % seq_type)[0]
+        coords = getattr(self, "%s_range_all" % seq_type)
+        # determine function used to set inter range
+        # start and end coordinates, given two pairs
+        # of fragment start and end coordinates
+        if strand == -1:
+            startfunc, endfunc = min, max
+        else:
+            startfunc, endfunc = max, min
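+        # on the plus strand, fragments ascend: each gap runs from the end of
+        # one fragment (the max of its coordinate pair) to the start of the
+        # next (the min of the following pair); the minus strand mirrors this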
+        inter_coords = []
+        for idx, coord in enumerate(coords[:-1]):
+            start = startfunc(coords[idx])
+            end = endfunc(coords[idx + 1])
+            inter_coords.append((min(start, end), max(start, end)))
+
+        return inter_coords
+
+    def _hit_inter_ranges_get(self):
+        return self._inter_ranges_get("hit")
+
+    hit_inter_ranges = property(
+        fget=_hit_inter_ranges_get,
+        doc="Hit sequence coordinates of the regions between fragments.",
+    )
+
+    def _query_inter_ranges_get(self):
+        return self._inter_ranges_get("query")
+
+    query_inter_ranges = property(
+        fget=_query_inter_ranges_get,
+        doc="Query sequence coordinates of the regions between fragments.",
+    )
+
+    def _inter_spans_get(self, seq_type):
+        assert seq_type in ("query", "hit")
+        attr_name = "%s_inter_ranges" % seq_type
+        return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
+
+    def _hit_inter_spans_get(self):
+        return self._inter_spans_get("hit")
+
+    hit_inter_spans = property(
+        fget=_hit_inter_spans_get, doc="Lengths of regions between hit fragments."
+    )
+
+    def _query_inter_spans_get(self):
+        return self._inter_spans_get("query")
+
+    query_inter_spans = property(
+        fget=_query_inter_spans_get, doc="Lengths of regions between query fragments."
+    )
+
+    # shortcuts for fragments' properties #
+
+    # bool check if there's more than one fragment
+    is_fragmented = property(
+        lambda self: len(self) > 1,
+        doc="Whether the HSP has more than one HSPFragment object.",
+    )
+
+    # first item properties with setters
+    hit_description = fullcascade(
+        "hit_description", doc="Description of the hit sequence."
+    )
+
+    query_description = fullcascade(
+        "query_description", doc="Description of the query sequence."
+    )
+
+    hit_id = fullcascade("hit_id", doc="ID of the hit sequence.")
+
+    query_id = fullcascade("query_id", doc="ID of the query sequence.")
+
+    molecule_type = fullcascade(
+        "molecule_type", doc="molecule_type of the hit and query SeqRecord objects."
+    )
+
+    # properties for single-fragment HSPs
+    fragment = singleitem(doc="HSPFragment object, first fragment.")
+
+    hit = singleitem("hit", doc="Hit sequence as a SeqRecord object, first fragment.")
+
+    query = singleitem(
+        "query", doc="Query sequence as a SeqRecord object, first fragment."
+    )
+
+    aln = singleitem(
+        "aln", doc="Alignment of the first fragment as a MultipleSeqAlignment object."
+    )
+
+    aln_annotation = singleitem(
+        "aln_annotation",
+        doc="Dictionary of annotation(s) of the first fragment's alignment.",
+    )
+
+    hit_features = singleitem(
+        "hit_features", doc="Hit sequence features, first fragment."
+    )
+
+    query_features = singleitem(
+        "query_features", doc="Query sequence features, first fragment."
+    )
+
+    hit_strand = singleitem("hit_strand", doc="Hit strand orientation, first fragment.")
+
+    query_strand = singleitem(
+        "query_strand", doc="Query strand orientation, first fragment."
+    )
+
+    hit_frame = singleitem(
+        "hit_frame", doc="Hit sequence reading frame, first fragment."
+    )
+
+    query_frame = singleitem(
+        "query_frame", doc="Query sequence reading frame, first fragment."
+    )
+
+    # properties for multi-fragment HSPs
+    fragments = allitems(doc="List of all HSPFragment objects.")
+
+    hit_all = allitems(
+        "hit", doc="List of all fragments' hit sequences as SeqRecord objects."
+    )
+
+    query_all = allitems(
+        "query", doc="List of all fragments' query sequences as SeqRecord objects."
+    )
+
+    aln_all = allitems(
+        "aln", doc="List of all fragments' alignments as MultipleSeqAlignment objects."
+    )
+
+    aln_annotation_all = allitems(
+        "aln_annotation",
+        doc="List of annotation dictionaries of all fragments' alignments.",
+    )
+
+    hit_features_all = allitems(
+        "hit_features", doc="List of all hit sequence features."
+    )
+
+    query_features_all = allitems(
+        "query_features", doc="List of all query sequence features."
+    )
+
+    hit_strand_all = allitems(
+        "hit_strand", doc="List of all fragments' hit sequence strands."
+    )
+
+    query_strand_all = allitems(
+        "query_strand", doc="List of all fragments' query sequence strands."
+    )
+
+    hit_frame_all = allitems(
+        "hit_frame", doc="List of all fragments' hit sequence reading frames."
+    )
+
+    query_frame_all = allitems(
+        "query_frame", doc="List of all fragments' query sequence reading frames."
+    )
+
+    hit_start_all = allitems(
+        "hit_start", doc="List of all fragments' hit start coordinates."
+    )
+
+    query_start_all = allitems(
+        "query_start", doc="List of all fragments' query start coordinates."
+    )
+
+    hit_end_all = allitems("hit_end", doc="List of all fragments' hit end coordinates.")
+
+    query_end_all = allitems(
+        "query_end", doc="List of all fragments' query end coordinates."
+    )
+
+    hit_span_all = allitems(
+        "hit_span", doc="List of all fragments' hit sequence sizes."
+    )
+
+    query_span_all = allitems(
+        "query_span", doc="List of all fragments' query sequence sizes."
+    )
+
+    hit_range_all = allitems(
+        "hit_range", doc="List of all fragments' hit start and end coordinates."
+    )
+
+    query_range_all = allitems(
+        "query_range", doc="List of all fragments' query start and end coordinates."
+    )
+
+
+class HSPFragment(_BaseHSP):
+    """Class representing a contiguous alignment of hit-query sequence.
+
+    HSPFragment forms the core of any parsed search output file. Depending on
+    the search output file format, it may contain the actual query and/or hit
+    sequences that produce the search hits. These sequences are stored as
+    SeqRecord objects (see SeqRecord):
+
+    >>> from Bio import SearchIO
+    >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+    >>> fragment = qresult[0][0][0]   # first hit, first hsp, first fragment
+    >>> print(fragment)
+          Query: 33211 mir_1
+            Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520...
+    Query range: [0:61] (1)
+      Hit range: [0:61] (1)
+      Fragments: 1 (61 columns)
+         Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+           Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
+
+    # the query sequence is a SeqRecord object
+    >>> fragment.query.__class__
+    <class 'Bio.SeqRecord.SeqRecord'>
+    >>> print(fragment.query)
+    ID: 33211
+    Name: aligned query sequence
+    Description: mir_1
+    Number of features: 0
+    /molecule_type=DNA
+    Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG')
+
+    # the hit sequence is a SeqRecord object as well
+    >>> fragment.hit.__class__
+    <class 'Bio.SeqRecord.SeqRecord'>
+    >>> print(fragment.hit)
+    ID: gi|262205317|ref|NR_030195.1|
+    Name: aligned hit sequence
+    Description: Homo sapiens microRNA 520b (MIR520B), microRNA
+    Number of features: 0
+    /molecule_type=DNA
+    Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG')
+
+    # when both query and hit are present, we get a MultipleSeqAlignment object
+    >>> fragment.aln.__class__
+    <class 'Bio.Align.MultipleSeqAlignment'>
+    >>> print(fragment.aln)
+    Alignment with 2 rows and 61 columns
+    CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211
+    CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1|
+
+    """
+
+    def __init__(
+        self,
+        hit_id="",
+        query_id="",
+        hit=None,
+        query=None,
+        molecule_type=None,
+    ):
+        """Initialize the class."""
+        self._molecule_type = molecule_type
+        self.aln_annotation = {}
+
+        self._hit_id = hit_id
+        self._query_id = query_id
+
+        for seq_type, seq in (("query", query), ("hit", hit)):
+            # default attributes of the query or hit
+            setattr(self, "_%s_description" % seq_type, "")
+            setattr(self, "_%s_features" % seq_type, [])
+            # query or hit attributes whose default value is None
+            for attr in ("strand", "frame", "start", "end"):
+                setattr(self, "%s_%s" % (seq_type, attr), None)
+            # assign self.query or self.hit through the property setter,
+            # without resorting to eval()
+            setattr(self, seq_type, seq if seq else None)
+
+    def __repr__(self):
+        """Return HSPFragment info; hit id, query id, number of columns."""
+        info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id)
+        try:
+            info += ", %i columns" % len(self)
+        except AttributeError:
+            pass
+        return "%s(%s)" % (self.__class__.__name__, info)
+
+    def __len__(self):
+        """Return alignment span."""
+        return self.aln_span
+
+    def __str__(self):
+        """Return string of HSP header and alignments."""
+        return self._str_hsp_header() + "\n" + self._str_aln()
+
+    def __getitem__(self, idx):
+        """Return object of index idx."""
+        if self.aln is not None:
+            obj = self.__class__(
+                hit_id=self.hit_id,
+                query_id=self.query_id,
+                molecule_type=self.molecule_type,
+            )
+            # transfer query and hit attributes
+            # let SeqRecord handle feature slicing, then retrieve the sliced
+            # features into the sliced HSPFragment
+            if self.query is not None:
+                obj.query = self.query[idx]
+                obj.query_features = obj.query.features
+            if self.hit is not None:
+                obj.hit = self.hit[idx]
+                obj.hit_features = obj.hit.features
+            # description, strand, frame
+            for attr in ("description", "strand", "frame"):
+                for seq_type in ("hit", "query"):
+                    attr_name = "%s_%s" % (seq_type, attr)
+                    self_val = getattr(self, attr_name)
+                    setattr(obj, attr_name, self_val)
+            # alignment annotation should be transferred, since we can compute
+            # the resulting annotation
+            obj.aln_annotation = {}
+            for key, value in self.aln_annotation.items():
+                assert len(value[idx]) == len(obj)
+                obj.aln_annotation[key] = value[idx]
+            return obj
+        else:
+            raise TypeError(
+                "Slicing for HSP objects without alignment is not supported."
+            )
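+
+    # Minimal slicing sketch (assumed sequences, not from the original file):
+    #
+    #     frag = HSPFragment(hit="ATGGCC", query="ATG-CC")
+    #     frag.aln_annotation["similarity"] = "|||  |"
+    #     sub = frag[:3]              # first three alignment columns
+    #     str(sub.query.seq)          # 'ATG'
+    #     sub.aln_annotation          # {'similarity': '|||'}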
+
+    def _str_aln(self):
+        lines = []
+        # alignment length
+        aln_span = getattr_str(self, "aln_span")
+        lines.append("  Fragments: 1 (%s columns)" % aln_span)
+        # sequences
+        if self.query is not None and self.hit is not None:
+            try:
+                qseq = str(self.query.seq)
+            except AttributeError:  # fall back if query has no .seq
+                qseq = "?"
+            try:
+                hseq = str(self.hit.seq)
+            except AttributeError:  # fall back if hit has no .seq
+                hseq = "?"
+
+            # similarity line
+            simil = ""
+            if "similarity" in self.aln_annotation and isinstance(
+                self.aln_annotation.get("similarity"), str
+            ):
+                simil = self.aln_annotation["similarity"]
+
+            if self.aln_span <= 67:
+                lines.append("%10s - %s" % ("Query", qseq))
+                if simil:
+                    lines.append("             %s" % simil)
+                lines.append("%10s - %s" % ("Hit", hseq))
+            else:
+                # adjust continuation character length, so we don't display
+                # the same residues twice
+                if self.aln_span - 66 > 3:
+                    cont = "~" * 3
+                else:
+                    cont = "~" * (self.aln_span - 66)
+                lines.append("%10s - %s%s%s" % ("Query", qseq[:59], cont, qseq[-5:]))
+                if simil:
+                    lines.append("             %s%s%s" % (simil[:59], cont, simil[-5:]))
+                lines.append("%10s - %s%s%s" % ("Hit", hseq[:59], cont, hseq[-5:]))
+
+        return "\n".join(lines)
+
+    # sequence properties #
+    def _set_seq(self, seq, seq_type):
+        """Check the given sequence for attribute setting (PRIVATE).
+
+        :param seq: sequence to check
+        :type seq: string or SeqRecord
+        :param seq_type: sequence type
+        :type seq_type: string, choice of 'hit' or 'query'
+
+        """
+        assert seq_type in ("hit", "query")
+        if seq is None:
+            return seq  # return immediately if seq is None
+        else:
+            if not isinstance(seq, (str, SeqRecord)):
+                raise TypeError(
+                    "%s sequence must be a string or a SeqRecord object." % seq_type
+                )
+        # check length if the opposite sequence is not None
+        opp_type = "hit" if seq_type == "query" else "query"
+        opp_seq = getattr(self, "_%s" % opp_type, None)
+        if opp_seq is not None:
+            if len(seq) != len(opp_seq):
+                raise ValueError(
+                    "Sequence lengths do not match. Expected: %r (%s); found: %r (%s)."
+                    % (len(opp_seq), opp_type, len(seq), seq_type)
+                )
+
+        seq_id = getattr(self, "%s_id" % seq_type)
+        seq_desc = getattr(self, "%s_description" % seq_type)
+        seq_feats = getattr(self, "%s_features" % seq_type)
+        seq_name = "aligned %s sequence" % seq_type
+
+        if isinstance(seq, SeqRecord):
+            seq.id = seq_id
+            seq.description = seq_desc
+            seq.name = seq_name
+            seq.features = seq_feats
+            seq.annotations["molecule_type"] = self.molecule_type
+        elif isinstance(seq, str):
+            seq = SeqRecord(
+                Seq(seq),
+                id=seq_id,
+                name=seq_name,
+                description=seq_desc,
+                features=seq_feats,
+                annotations={"molecule_type": self.molecule_type},
+            )
+
+        return seq
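+
+    # Behavior sketch for _set_seq (assumed values): a plain string is wrapped
+    # in a SeqRecord carrying the fragment's ID and description, and a length
+    # mismatch with the opposite sequence is rejected:
+    #
+    #     frag = HSPFragment(hit_id="hit1", query_id="q1")
+    #     frag.query = "ATGC"   # stored as SeqRecord(Seq('ATGC'), id='q1')
+    #     frag.hit = "ATG"      # raises ValueError: lengths 3 != 4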
+
+    def _hit_get(self):
+        return self._hit
+
+    def _hit_set(self, value):
+        self._hit = self._set_seq(value, "hit")
+
+    hit = property(
+        fget=_hit_get,
+        fset=_hit_set,
+        doc="Hit sequence as a SeqRecord object, defaults to None.",
+    )
+
+    def _query_get(self):
+        return self._query
+
+    def _query_set(self, value):
+        self._query = self._set_seq(value, "query")
+
+    query = property(
+        fget=_query_get,
+        fset=_query_set,
+        doc="Query sequence as a SeqRecord object, defaults to None.",
+    )
+
+    def _aln_get(self):
+        if self.query is None and self.hit is None:
+            return None
+        if self.hit is None:
+            msa = MultipleSeqAlignment([self.query])
+        elif self.query is None:
+            msa = MultipleSeqAlignment([self.hit])
+        else:
+            msa = MultipleSeqAlignment([self.query, self.hit])
+        molecule_type = self.molecule_type
+        if molecule_type is not None:
+            msa.molecule_type = molecule_type
+        return msa
+
+    aln = property(
+        fget=_aln_get,
+        doc="Query-hit alignment as a MultipleSeqAlignment object, defaults to None.",
+    )
+
+    def _molecule_type_get(self):
+        return self._molecule_type
+
+    def _molecule_type_set(self, value):
+        self._molecule_type = value
+        try:
+            self.query.annotations["molecule_type"] = value
+        except AttributeError:
+            pass
+        try:
+            self.hit.annotations["molecule_type"] = value
+        except AttributeError:
+            pass
+
+    molecule_type = property(
+        fget=_molecule_type_get,
+        fset=_molecule_type_set,
+        doc="molecule type used in the fragment's "
+        "sequence records and alignment, defaults to None.",
+    )
+
+    def _aln_span_get(self):
+        # length of alignment (gaps included)
+        # alignment span can be its own attribute, or computed from
+        # query / hit length
+        try:
+            self._aln_span
+        except AttributeError:
+            if self.query is not None:
+                self._aln_span = len(self.query)
+            elif self.hit is not None:
+                self._aln_span = len(self.hit)
+
+        return self._aln_span
+
+    def _aln_span_set(self, value):
+        self._aln_span = value
+
+    aln_span = property(
+        fget=_aln_span_get,
+        fset=_aln_span_set,
+        doc="The number of alignment columns covered by the fragment.",
+    )
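+
+    # For example (illustrative values): a fragment built only from
+    # query="ATG-CC" reports aln_span == 6, the gapped alignment length,
+    # even though the ungapped query has just five residues.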
+
+    # id, description, and features properties #
+    hit_description = fragcascade("description", "hit", doc="Hit sequence description.")
+
+    query_description = fragcascade(
+        "description", "query", doc="Query sequence description."
+    )
+
+    hit_id = fragcascade("id", "hit", doc="Hit sequence ID.")
+
+    query_id = fragcascade("id", "query", doc="Query sequence ID.")
+
+    hit_features = fragcascade("features", "hit", doc="Hit sequence features.")
+
+    query_features = fragcascade("features", "query", doc="Query sequence features.")
+
+    # strand properties #
+    def _prep_strand(self, strand):
+        # follow SeqFeature's convention
+        if strand not in (-1, 0, 1, None):
+            raise ValueError("Strand should be -1, 0, 1, or None; not %r" % strand)
+        return strand
+
+    def _get_strand(self, seq_type):
+        assert seq_type in ("hit", "query")
+        strand = getattr(self, "_%s_strand" % seq_type)
+
+        if strand is None:
+            # try to compute strand from frame
+            frame = getattr(self, "%s_frame" % seq_type)
+            if frame is not None:
+                try:
+                    strand = frame // abs(frame)
+                except ZeroDivisionError:
+                    strand = 0
+                setattr(self, "%s_strand" % seq_type, strand)
+
+        return strand
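+
+    # For example (illustrative values): if query_strand was never set but
+    # query_frame is -2, the property computes -2 // abs(-2) == -1 and caches
+    # it; a frame of 0 maps to strand 0 via the ZeroDivisionError branch.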
+
+    def _hit_strand_get(self):
+        return self._get_strand("hit")
+
+    def _hit_strand_set(self, value):
+        self._hit_strand = self._prep_strand(value)
+
+    hit_strand = property(
+        fget=_hit_strand_get,
+        fset=_hit_strand_set,
+        doc="Hit sequence strand, defaults to None.",
+    )
+
+    def _query_strand_get(self):
+        return self._get_strand("query")
+
+    def _query_strand_set(self, value):
+        self._query_strand = self._prep_strand(value)
+
+    query_strand = property(
+        fget=_query_strand_get,
+        fset=_query_strand_set,
+        doc="Query sequence strand, defaults to None.",
+    )
+
+    # frame properties #
+    def _prep_frame(self, frame):
+        if frame not in (-3, -2, -1, 0, 1, 2, 3, None):
+            raise ValueError(
+                "Frame should be an integer between -3 and 3, or None; not %r" % frame
+            )
+        return frame
+
+    def _hit_frame_get(self):
+        return self._hit_frame
+
+    def _hit_frame_set(self, value):
+        self._hit_frame = self._prep_frame(value)
+
+    hit_frame = property(
+        fget=_hit_frame_get,
+        fset=_hit_frame_set,
+        doc="Hit sequence reading frame, defaults to None.",
+    )
+
+    def _query_frame_get(self):
+        """Get query sequence reading frame (PRIVATE)."""
+        return self._query_frame
+
+    def _query_frame_set(self, value):
+        """Set query sequence reading frame (PRIVATE)."""
+        self._query_frame = self._prep_frame(value)
+
+    query_frame = property(
+        fget=_query_frame_get,
+        fset=_query_frame_set,
+        doc="Query sequence reading frame, defaults to None.",
+    )
+
+    # coordinate properties #
+    def _prep_coord(self, coord, opp_coord_name, op):
+        # coord must either be None or int
+        if coord is None:
+            return coord
+        assert isinstance(coord, int)
+        # try to get opposite coordinate, if it's not present, return
+        try:
+            opp_coord = getattr(self, opp_coord_name)
+        except AttributeError:
+            return coord
+        # if opposite coordinate is None, return
+        if opp_coord is None:
+            return coord
+        # otherwise compare it to coord ('>=' or '<=')
+        else:
+            assert op(coord, opp_coord)
+        return coord
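+
+    # Validation sketch (assumed values): after ``frag.hit_start = 10``,
+    # assigning ``frag.hit_end = 5`` trips the assertion above, because
+    # _prep_coord(5, "hit_start", ge) checks ge(5, 10); once both coordinates
+    # are set, start <= end always holds.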
+
+    def _hit_start_get(self):
+        """Get the sequence hit start coordinate (PRIVATE)."""
+        return self._hit_start
+
+    def _hit_start_set(self, value):
+        """Set the sequence hit start coordinate (PRIVATE)."""
+        self._hit_start = self._prep_coord(value, "hit_end", le)
+
+    hit_start = property(
+        fget=_hit_start_get,
+        fset=_hit_start_set,
+        doc="Hit sequence start coordinate, defaults to None.",
+    )
+
+    def _query_start_get(self):
+        """Get the query sequence start coordinate (PRIVATE)."""
+        return self._query_start
+
+    def _query_start_set(self, value):
+        """Set the query sequence start coordinate (PRIVATE)."""
+        self._query_start = self._prep_coord(value, "query_end", le)
+
+    query_start = property(
+        fget=_query_start_get,
+        fset=_query_start_set,
+        doc="Query sequence start coordinate, defaults to None.",
+    )
+
+    def _hit_end_get(self):
+        """Get the hit sequence end coordinate (PRIVATE)."""
+        return self._hit_end
+
+    def _hit_end_set(self, value):
+        """Set the hit sequence end coordinate (PRIVATE)."""
+        self._hit_end = self._prep_coord(value, "hit_start", ge)
+
+    hit_end = property(
+        fget=_hit_end_get,
+        fset=_hit_end_set,
+        doc="Hit sequence end coordinate, defaults to None.",
+    )
+
+    def _query_end_get(self):
+        """Get the query sequence end coordinate (PRIVATE)."""
+        return self._query_end
+
+    def _query_end_set(self, value):
+        """Set the query sequence end coordinate (PRIVATE)."""
+        self._query_end = self._prep_coord(value, "query_start", ge)
+
+    query_end = property(
+        fget=_query_end_get,
+        fset=_query_end_set,
+        doc="Query sequence end coordinate, defaults to None.",
+    )
+
+    # coordinate-dependent properties #
+    def _hit_span_get(self):
+        """Return the number of residues covered by the hit sequence (PRIVATE)."""
+        try:
+            return self.hit_end - self.hit_start
+        except TypeError:  # triggered if any of the coordinates are None
+            return None
+
+    hit_span = property(
+        fget=_hit_span_get, doc="The number of residues covered by the hit sequence."
+    )
+
+    def _query_span_get(self):
+        """Return the number or residues covered by the query (PRIVATE)."""
+        try:
+            return self.query_end - self.query_start
+        except TypeError:  # triggered if any of the coordinates are None
+            return None
+
+    query_span = property(
+        fget=_query_span_get,
+        doc="The number of residues covered by the query sequence.",
+    )
+
+    def _hit_range_get(self):
+        """Return the start and end of a hit (PRIVATE)."""
+        return (self.hit_start, self.hit_end)
+
+    hit_range = property(
+        fget=_hit_range_get, doc="Tuple of hit start and end coordinates."
+    )
+
+    def _query_range_get(self):
+        """Return the start and end of a query (PRIVATE)."""
+        return (self.query_start, self.query_end)
+
+    query_range = property(
+        fget=_query_range_get, doc="Tuple of query start and end coordinates."
+    )
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/_model/query.py b/code/lib/Bio/SearchIO/_model/query.py
new file mode 100644
index 0000000..f82cc5c
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_model/query.py
@@ -0,0 +1,743 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SearchIO object to model search results from a single query."""
+
+
+from copy import deepcopy
+from itertools import chain
+from collections import OrderedDict
+
+from Bio.SearchIO._utils import optionalcascade
+
+from ._base import _BaseSearchObject
+from .hit import Hit
+
+
+class QueryResult(_BaseSearchObject):
+    """Class representing search results from a single query.
+
+    QueryResult is the container object that stores all search hits from a
+    single search query. It is the top-level object returned by SearchIO's two
+    main functions, ``read`` and ``parse``. Depending on the search results and
+    search output format, a QueryResult object will contain zero or more Hit
+    objects (see Hit).
+
+    You can take a quick look at a QueryResult's contents and attributes by
+    invoking ``print`` on it::
+
+        >>> from Bio import SearchIO
+        >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+        >>> print(qresult)
+        Program: blastn (2.2.27+)
+          Query: 33211 (61)
+                 mir_1
+         Target: refseq_rna
+           Hits: ----  -----  ----------------------------------------------------------
+                    #  # HSP  ID + description
+                 ----  -----  ----------------------------------------------------------
+                    0      1  gi|262205317|ref|NR_030195.1|  Homo sapiens microRNA 52...
+                    1      1  gi|301171311|ref|NR_035856.1|  Pan troglodytes microRNA...
+                    2      1  gi|270133242|ref|NR_032573.1|  Macaca mulatta microRNA ...
+                    3      2  gi|301171322|ref|NR_035857.1|  Pan troglodytes microRNA...
+                    4      1  gi|301171267|ref|NR_035851.1|  Pan troglodytes microRNA...
+                    5      2  gi|262205330|ref|NR_030198.1|  Homo sapiens microRNA 52...
+                    6      1  gi|262205302|ref|NR_030191.1|  Homo sapiens microRNA 51...
+                    7      1  gi|301171259|ref|NR_035850.1|  Pan troglodytes microRNA...
+                    8      1  gi|262205451|ref|NR_030222.1|  Homo sapiens microRNA 51...
+                    9      2  gi|301171447|ref|NR_035871.1|  Pan troglodytes microRNA...
+                   10      1  gi|301171276|ref|NR_035852.1|  Pan troglodytes microRNA...
+                   11      1  gi|262205290|ref|NR_030188.1|  Homo sapiens microRNA 51...
+        ...
+
+    If you just want to know how many hits a QueryResult has, you can invoke
+    ``len`` on it. Alternatively, you can simply type its name in the interpreter::
+
+        >>> len(qresult)
+        100
+        >>> qresult
+        QueryResult(id='33211', 100 hits)
+
+    QueryResult behaves like a hybrid of Python's built-in list and dictionary.
+    You can retrieve its items (Hit objects) using the integer index of the
+    item, just like regular Python lists::
+
+        >>> first_hit = qresult[0]
+        >>> first_hit
+        Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
+
+    You can slice QueryResult objects as well. Slicing will return a new
+    QueryResult object containing only the sliced hits::
+
+        >>> sliced_qresult = qresult[:3]    # slice the first three hits
+        >>> len(qresult)
+        100
+        >>> len(sliced_qresult)
+        3
+        >>> print(sliced_qresult)
+        Program: blastn (2.2.27+)
+          Query: 33211 (61)
+                 mir_1
+         Target: refseq_rna
+           Hits: ----  -----  ----------------------------------------------------------
+                    #  # HSP  ID + description
+                 ----  -----  ----------------------------------------------------------
+                    0      1  gi|262205317|ref|NR_030195.1|  Homo sapiens microRNA 52...
+                    1      1  gi|301171311|ref|NR_035856.1|  Pan troglodytes microRNA...
+                    2      1  gi|270133242|ref|NR_032573.1|  Macaca mulatta microRNA ...
+
+    Like Python dictionaries, you can also retrieve hits using the hit's ID.
+    This is useful for retrieving hits that you know should exist in a given
+    search::
+
+        >>> hit = qresult['gi|262205317|ref|NR_030195.1|']
+        >>> hit
+        Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
+
+    You can also replace a Hit in QueryResult with another Hit using either the
+    integer index or hit key string. Note that the replacing object must be a
+    Hit that has the same ``query_id`` property as the QueryResult object.
+
+    If you're not sure whether a QueryResult contains a particular hit, you can
+    use the hit ID to check for membership first::
+
+        >>> 'gi|262205317|ref|NR_030195.1|' in qresult
+        True
+        >>> 'gi|262380031|ref|NR_023426.1|' in qresult
+        False
+
+    Or, if you just want to know the rank / position of a given hit, you can
+    use the hit ID as an argument for the ``index`` method. Note that the values
+    returned will be zero-based. So zero (0) means the hit is the first in the
+    QueryResult, three (3) means the hit is the fourth item, and so on. If the
+    hit does not exist in the QueryResult, a ``ValueError`` will be raised.
+
+        >>> qresult.index('gi|262205317|ref|NR_030195.1|')
+        0
+        >>> qresult.index('gi|262205330|ref|NR_030198.1|')
+        5
+        >>> qresult.index('gi|262380031|ref|NR_023426.1|')
+        Traceback (most recent call last):
+        ...
+        ValueError: ...
+
+    To ease working with a large number of hits, QueryResult has several
+    ``filter`` and ``map`` methods, analogous to Python's built-in functions with
+    the same names. There are ``filter`` and ``map`` methods available for
+    operations over both Hit objects or HSP objects. As an example, here we are
+    using the ``hit_map`` method to rename all hit IDs within a QueryResult::
+
+        >>> def renamer(hit):
+        ...     hit.id = hit.id.split('|')[3]
+        ...     return hit
+        >>> mapped_qresult = qresult.hit_map(renamer)
+        >>> print(mapped_qresult)
+        Program: blastn (2.2.27+)
+          Query: 33211 (61)
+                 mir_1
+         Target: refseq_rna
+           Hits: ----  -----  ----------------------------------------------------------
+                    #  # HSP  ID + description
+                 ----  -----  ----------------------------------------------------------
+                    0      1  NR_030195.1  Homo sapiens microRNA 520b (MIR520B), micr...
+                    1      1  NR_035856.1  Pan troglodytes microRNA mir-520b (MIR520B...
+                    2      1  NR_032573.1  Macaca mulatta microRNA mir-519a (MIR519A)...
+        ...
+
+    The principle for the other ``map`` and ``filter`` methods is similar: they
+    accept a function, apply it, and return a new QueryResult object.
+
+    There are also other methods useful for working with list-like objects:
+    ``append``, ``pop``, and ``sort``. More details and examples are available
+    in their respective documentation.
+
+    Finally, just like Python lists and dictionaries, QueryResult objects are
+    iterable. Iteration over QueryResults will yield Hit objects::
+
+        >>> for hit in qresult[:4]:     # iterate over the first four items
+        ...     hit
+        ...
+        Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
+        Hit(id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 hsps)
+        Hit(id='gi|270133242|ref|NR_032573.1|', query_id='33211', 1 hsps)
+        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
+
+    If you need access to all the hits in a QueryResult object, you can get
+    them in a list using the ``hits`` property. Similarly, access to all hit IDs is
+    available through the ``hit_keys`` property.
+
+        >>> qresult.hits
+        [Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps), ...]
+        >>> qresult.hit_keys
+        ['gi|262205317|ref|NR_030195.1|', 'gi|301171311|ref|NR_035856.1|', ...]
+
+    """
+
+    # attributes we don't want to transfer when creating a new QueryResult
+    # object from this one; note the name-mangled form of __alt_hit_ids,
+    # which is how the attribute appears in the instance __dict__
+    _NON_STICKY_ATTRS = ("_items", "_QueryResult__alt_hit_ids")
+
+    def __init__(self, hits=(), id=None, hit_key_function=None):
+        """Initialize a QueryResult object.
+
+        :param id: query sequence ID
+        :type id: string
+        :param hits: iterator yielding Hit objects
+        :type hits: iterable
+        :param hit_key_function: function to define hit keys
+        :type hit_key_function: callable, accepts Hit objects, returns string
+
+        """
+        # default values
+        self._id = id
+        self._hit_key_function = hit_key_function or _hit_key_func
+        self._items = OrderedDict()
+        self._description = None
+        self.__alt_hit_ids = {}
+        self.program = ""
+        self.target = ""
+        self.version = ""
+
+        # validate Hit objects and fill up self._items
+        for hit in hits:
+            # validation is handled by __setitem__
+            self.append(hit)
+
+    def __iter__(self):
+        """Iterate over hits."""
+        return iter(self.hits)
+
+    @property
+    def hits(self):
+        """Hit objects contained in the QueryResult."""
+        return list(self._items.values())
+
+    @property
+    def hit_keys(self):
+        """Hit IDs of the Hit objects contained in the QueryResult."""
+        return list(self._items.keys())
+
+    @property
+    def items(self):
+        """List of tuples of Hit IDs and Hit objects."""
+        return list(self._items.items())
+
+    def iterhits(self):
+        """Return an iterator over the Hit objects."""
+        yield from self._items.values()
+
+    def iterhit_keys(self):
+        """Return an iterator over the ID of the Hit objects."""
+        yield from self._items
+
+    def iteritems(self):
+        """Return an iterator yielding tuples of Hit ID and Hit objects."""
+        yield from self._items.items()
+
+    def __contains__(self, hit_key):
+        """Return True if hit key in items or alternative hit identifiers."""
+        if isinstance(hit_key, Hit):
+            return self._hit_key_function(hit_key) in self._items
+        return hit_key in self._items or hit_key in self.__alt_hit_ids
+
+    def __len__(self):
+        """Return the number of items."""
+        return len(self._items)
+
+    def __bool__(self):
+        """Return True if there are items."""
+        return bool(self._items)
+
+    def __repr__(self):
+        """Return string representation of the QueryResult object."""
+        return "QueryResult(id=%r, %r hits)" % (self.id, len(self))
+
+    def __str__(self):
+        """Return a human readable summary of the QueryResult object."""
+        lines = []
+
+        # set program and version line
+        lines.append("Program: %s (%s)" % (self.program, self.version))
+
+        # set query id line
+        qid_line = "  Query: %s" % self.id
+        try:
+            seq_len = self.seq_len
+        except AttributeError:
+            pass
+        else:
+            qid_line += " (%i)" % seq_len
+        lines.append(qid_line)
+        if self.description:
+            line = "         %s" % self.description
+            line = line[:77] + "..." if len(line) > 80 else line
+            lines.append(line)
+
+        # set target line
+        lines.append(" Target: %s" % self.target)
+
+        # set hit lines
+        if not self.hits:
+            lines.append("   Hits: 0")
+        else:
+            lines.append("   Hits: %s  %s  %s" % ("-" * 4, "-" * 5, "-" * 58))
+            pattern = "%13s  %5s  %s"
+            lines.append(pattern % ("#", "# HSP", "ID + description"))
+            lines.append(pattern % ("-" * 4, "-" * 5, "-" * 58))
+            for idx, hit in enumerate(self.hits):
+                if idx < 30:
+                    hid_line = "%s  %s" % (hit.id, hit.description)
+                    if len(hid_line) > 58:
+                        hid_line = hid_line[:55] + "..."
+                    lines.append(pattern % (idx, len(hit), hid_line))
+                elif idx > len(self.hits) - 4:
+                    hid_line = "%s  %s" % (hit.id, hit.description)
+                    if len(hid_line) > 58:
+                        hid_line = hid_line[:55] + "..."
+                    lines.append(pattern % (idx, len(hit), hid_line))
+                elif idx == 30:
+                    lines.append("%14s" % "~~~")
+
+        return "\n".join(lines)
+
+    def __getitem__(self, hit_key):
+        """Return a QueryResult object that matches the hit_key."""
+        # retrieval using slice objects returns another QueryResult object
+        if isinstance(hit_key, slice):
+            # should we return just a list of Hits instead of a full blown
+            # QueryResult object if it's a slice?
+            hits = list(self.hits)[hit_key]
+            obj = self.__class__(hits, self.id, self._hit_key_function)
+            self._transfer_attrs(obj)
+            return obj
+
+        # if key is an int, then retrieve the Hit at the int index
+        elif isinstance(hit_key, int):
+            length = len(self)
+            if 0 <= hit_key < length:
+                for idx, item in enumerate(self.iterhits()):
+                    if idx == hit_key:
+                        return item
+            elif -1 * length <= hit_key < 0:
+                for idx, item in enumerate(self.iterhits()):
+                    if length + hit_key == idx:
+                        return item
+            raise IndexError("list index out of range")
+
+        # if key is a string, then do a regular dictionary retrieval
+        # falling back on alternative hit IDs
+        try:
+            return self._items[hit_key]
+        except KeyError:
+            return self._items[self.__alt_hit_ids[hit_key]]
+
+    def __setitem__(self, hit_key, hit):
+        """Add an item of key hit_key and value hit."""
+        # only accept string keys
+        if not isinstance(hit_key, str):
+            raise TypeError("QueryResult object keys must be strings.")
+        # hit must be a Hit object
+        if not isinstance(hit, Hit):
+            raise TypeError("QueryResult objects can only contain Hit objects.")
+        qid = self.id
+        hqid = hit.query_id
+        # and it must have the same query ID as this object's ID, unless the
+        # query ID is None (the default for empty objects), in which case we
+        # use the hit's query ID as the query ID
+        if qid is not None:
+            if hqid != qid:
+                raise ValueError(
+                    "Expected Hit with query ID %r, found %r instead." % (qid, hqid)
+                )
+        else:
+            self.id = hqid
+        # same thing with descriptions
+        qdesc = self.description
+        hqdesc = hit.query_description
+        if qdesc is not None:
+            if hqdesc != qdesc:
+                raise ValueError(
+                    "Expected Hit with query description %r, found %r instead."
+                    % (qdesc, hqdesc)
+                )
+        else:
+            self.description = hqdesc
+
+        # remove existing alt_id references, if hit_key already exists
+        if hit_key in self._items:
+            for alt_key in self._items[hit_key].id_all[1:]:
+                del self.__alt_hit_ids[alt_key]
+
+        # if hit_key is already present as an alternative ID
+        # delete it from the alternative ID dict
+        if hit_key in self.__alt_hit_ids:
+            del self.__alt_hit_ids[hit_key]
+
+        self._items[hit_key] = hit
+        for alt_id in hit.id_all[1:]:
+            self.__alt_hit_ids[alt_id] = hit_key
+
+    def __delitem__(self, hit_key):
+        """Delete item of key hit_key."""
+        # if hit_key an integer or slice, get the corresponding key first
+        # and put it into a list
+        if isinstance(hit_key, int):
+            hit_keys = [list(self.hit_keys)[hit_key]]
+        # the same, if it's a slice
+        elif isinstance(hit_key, slice):
+            hit_keys = list(self.hit_keys)[hit_key]
+        # otherwise put it in a list
+        else:
+            hit_keys = [hit_key]
+
+        for key in hit_keys:
+            deleted = False
+            if key in self._items:
+                del self._items[key]
+                deleted = True
+            if key in self.__alt_hit_ids:
+                del self._items[self.__alt_hit_ids[key]]
+                del self.__alt_hit_ids[key]
+                deleted = True
+            if not deleted:
+                raise KeyError(repr(key))
+
+    # properties #
+    id = optionalcascade("_id", "query_id", """QueryResult ID string""")
+    description = optionalcascade(
+        "_description", "query_description", """QueryResult description"""
+    )
+
+    @property
+    def hsps(self):
+        """Access the HSP objects contained in the QueryResult."""
+        return sorted(
+            (hsp for hsp in chain(*self.hits)), key=lambda hsp: hsp.output_index
+        )
+
+    @property
+    def fragments(self):
+        """Access the HSPFragment objects contained in the QueryResult."""
+        return list(chain(*self.hsps))
+
+    # public methods #
+    def absorb(self, hit):
+        """Add a Hit object to the end of QueryResult.
+
+        If the QueryResult already has a Hit with the same ID, append the new
+        Hit's HSPs into the existing Hit.
+
+        :param hit: object to absorb
+        :type hit: Hit
+
+        This method is used for file formats that may output the same Hit in
+        separate places, such as BLAT or Exonerate. In both formats, Hits
+        with different strands are put in different places. However, SearchIO
+        considers them to be the same, since a Hit object should contain all
+        database entries with the same ID, regardless of strand orientation.
+
+        """
+        try:
+            self.append(hit)
+        except ValueError:
+            assert hit.id in self
+            for hsp in hit:
+                self[hit.id].append(hsp)
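+
+    # Usage sketch (hypothetical Hit objects): appending a Hit whose ID is
+    # already present raises ValueError, while ``absorb`` merges instead:
+    #
+    #     qresult.absorb(plus_strand_hit)   # stored as a new Hit
+    #     qresult.absorb(minus_strand_hit)  # same ID: its HSPs are appended
+    #                                       # to the existing Hit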
+
+    def append(self, hit):
+        """Add a Hit object to the end of QueryResult.
+
+        :param hit: object to append
+        :type hit: Hit
+
+        Any Hit object appended must have the same ``query_id`` property as the
+        QueryResult's ``id`` property. If the hit key already exists, a
+        ``ValueError`` will be raised.
+
+        """
+        # if a custom hit_key_function is supplied, use it to define the hit key
+        if self._hit_key_function is not None:
+            hit_key = self._hit_key_function(hit)
+        else:
+            hit_key = hit.id
+
+        if hit_key not in self and all(pid not in self for pid in hit.id_all[1:]):
+            self[hit_key] = hit
+        else:
+            raise ValueError(
+                "The ID or alternative IDs of Hit %r exists in this QueryResult."
+                % hit_key
+            )
+
+    def hit_filter(self, func=None):
+        """Create new QueryResult object whose Hit objects pass the filter function.
+
+        :param func: filter function
+        :type func: callable, accepts Hit, returns bool
+
+        Here is an example of using ``hit_filter`` to select Hits whose
+        description begins with the string 'Homo sapiens', case sensitive::
+
+            >>> from Bio import SearchIO
+            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+            >>> def desc_filter(hit):
+            ...     return hit.description.startswith('Homo sapiens')
+            ...
+            >>> len(qresult)
+            100
+            >>> filtered = qresult.hit_filter(desc_filter)
+            >>> len(filtered)
+            39
+            >>> print(filtered[:4])
+            Program: blastn (2.2.27+)
+              Query: 33211 (61)
+                     mir_1
+             Target: refseq_rna
+               Hits: ----  -----  ----------------------------------------------------------
+                        #  # HSP  ID + description
+                     ----  -----  ----------------------------------------------------------
+                        0      1  gi|262205317|ref|NR_030195.1|  Homo sapiens microRNA 52...
+                        1      2  gi|262205330|ref|NR_030198.1|  Homo sapiens microRNA 52...
+                        2      1  gi|262205302|ref|NR_030191.1|  Homo sapiens microRNA 51...
+                        3      1  gi|262205451|ref|NR_030222.1|  Homo sapiens microRNA 51...
+
+        Note that instance attributes (other than the hits) from the unfiltered
+        QueryResult are retained in the filtered object.
+
+            >>> qresult.program == filtered.program
+            True
+            >>> qresult.target == filtered.target
+            True
+
+        """
+        hits = list(filter(func, self.hits))
+        obj = self.__class__(hits, self.id, self._hit_key_function)
+        self._transfer_attrs(obj)
+        return obj
+
+    def hit_map(self, func=None):
+        """Create new QueryResult object, mapping the given function to its Hits.
+
+        :param func: map function
+        :type func: callable, accepts Hit, returns Hit
+
+        Here is an example of using ``hit_map`` with a function that discards all
+        HSPs in a Hit except for the first one::
+
+            >>> from Bio import SearchIO
+            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+            >>> print(qresult[:8])
+            Program: blastn (2.2.27+)
+              Query: 33211 (61)
+                     mir_1
+             Target: refseq_rna
+               Hits: ----  -----  ----------------------------------------------------------
+                        #  # HSP  ID + description
+                     ----  -----  ----------------------------------------------------------
+                        0      1  gi|262205317|ref|NR_030195.1|  Homo sapiens microRNA 52...
+                        1      1  gi|301171311|ref|NR_035856.1|  Pan troglodytes microRNA...
+                        2      1  gi|270133242|ref|NR_032573.1|  Macaca mulatta microRNA ...
+                        3      2  gi|301171322|ref|NR_035857.1|  Pan troglodytes microRNA...
+                        4      1  gi|301171267|ref|NR_035851.1|  Pan troglodytes microRNA...
+                        5      2  gi|262205330|ref|NR_030198.1|  Homo sapiens microRNA 52...
+                        6      1  gi|262205302|ref|NR_030191.1|  Homo sapiens microRNA 51...
+                        7      1  gi|301171259|ref|NR_035850.1|  Pan troglodytes microRNA...
+
+            >>> top_hsp = lambda hit: hit[:1]
+            >>> mapped_qresult = qresult.hit_map(top_hsp)
+            >>> print(mapped_qresult[:8])
+            Program: blastn (2.2.27+)
+              Query: 33211 (61)
+                     mir_1
+             Target: refseq_rna
+               Hits: ----  -----  ----------------------------------------------------------
+                        #  # HSP  ID + description
+                     ----  -----  ----------------------------------------------------------
+                        0      1  gi|262205317|ref|NR_030195.1|  Homo sapiens microRNA 52...
+                        1      1  gi|301171311|ref|NR_035856.1|  Pan troglodytes microRNA...
+                        2      1  gi|270133242|ref|NR_032573.1|  Macaca mulatta microRNA ...
+                        3      1  gi|301171322|ref|NR_035857.1|  Pan troglodytes microRNA...
+                        4      1  gi|301171267|ref|NR_035851.1|  Pan troglodytes microRNA...
+                        5      1  gi|262205330|ref|NR_030198.1|  Homo sapiens microRNA 52...
+                        6      1  gi|262205302|ref|NR_030191.1|  Homo sapiens microRNA 51...
+                        7      1  gi|301171259|ref|NR_035850.1|  Pan troglodytes microRNA...
+
+        """
+        hits = [deepcopy(hit) for hit in self.hits]
+        if func is not None:
+            hits = [func(x) for x in hits]
+        obj = self.__class__(hits, self.id, self._hit_key_function)
+        self._transfer_attrs(obj)
+        return obj
+
+    def hsp_filter(self, func=None):
+        """Create new QueryResult object whose HSP objects pass the filter function.
+
+        ``hsp_filter`` is the same as ``hit_filter``, except that it filters
+        directly on each HSP object in every Hit. If the filtering removes
+        all HSP objects in a given Hit, the entire Hit will be discarded. This
+        will result in the QueryResult having fewer Hit objects after filtering.
+        """
+        hits = [x for x in (hit.filter(func) for hit in self.hits) if x]
+        obj = self.__class__(hits, self.id, self._hit_key_function)
+        self._transfer_attrs(obj)
+        return obj
+
+    def hsp_map(self, func=None):
+        """Create new QueryResult object, mapping the given function to its HSPs.
+
+        ``hsp_map`` is the same as ``hit_map``, except that it applies the given
+        function to all HSP objects in every Hit, instead of the Hit objects.
+        """
+        hits = [x for x in (hit.map(func) for hit in list(self.hits)[:]) if x]
+        obj = self.__class__(hits, self.id, self._hit_key_function)
+        self._transfer_attrs(obj)
+        return obj
+
+    # marker for default self.pop() return value
+    # this method is adapted from Python's built in OrderedDict.pop
+    # implementation
+    __marker = object()
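+    # The sentinel lets ``pop`` distinguish "no default supplied" from an
+    # explicit ``default=None``, mirroring OrderedDict.pop.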
+
+    def pop(self, hit_key=-1, default=__marker):
+        """Remove the specified hit key and return the Hit object.
+
+        :param hit_key: key of the Hit object to return
+        :type hit_key: int or string
+        :param default: return value if no Hit exists with the given key
+        :type default: object
+
+        By default, ``pop`` will remove and return the last Hit object in the
+        QueryResult object. To remove specific Hit objects, you can use its
+        integer index or hit key.
+
+            >>> from Bio import SearchIO
+            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+            >>> len(qresult)
+            100
+            >>> for hit in qresult[:5]:
+            ...     print(hit.id)
+            ...
+            gi|262205317|ref|NR_030195.1|
+            gi|301171311|ref|NR_035856.1|
+            gi|270133242|ref|NR_032573.1|
+            gi|301171322|ref|NR_035857.1|
+            gi|301171267|ref|NR_035851.1|
+
+            # remove the last hit
+            >>> qresult.pop()
+            Hit(id='gi|397513516|ref|XM_003827011.1|', query_id='33211', 1 hsps)
+
+            # remove the first hit
+            >>> qresult.pop(0)
+            Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
+
+            # remove hit with the given ID
+            >>> qresult.pop('gi|301171322|ref|NR_035857.1|')
+            Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
+
+        """
+        # if key is an integer (index)
+        # get the ID for the Hit object at that index
+        if isinstance(hit_key, int):
+            # raise the appropriate error if there is no hit
+            if not self:
+                raise IndexError("pop from empty list")
+            hit_key = list(self.hit_keys)[hit_key]
+
+        try:
+            hit = self._items.pop(hit_key)
+            # remove all alternative IDs of the popped hit
+            for alt_id in hit.id_all[1:]:
+                try:
+                    del self.__alt_hit_ids[alt_id]
+                except KeyError:
+                    pass
+            return hit
+        except KeyError:
+            if hit_key in self.__alt_hit_ids:
+                return self.pop(self.__alt_hit_ids[hit_key], default)
+            # if key doesn't exist and no default is set, raise a KeyError
+            if default is self.__marker:
+                raise KeyError(hit_key) from None
+        # if key doesn't exist but a default is set, return the default value
+        return default
+
+    def index(self, hit_key):
+        """Return the index of a given hit key, zero-based.
+
+        :param hit_key: hit ID
+        :type hit_key: string
+
+        This method is useful for finding out the integer index (usually
+        correlated with search rank) of a given hit key.
+
+            >>> from Bio import SearchIO
+            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
+            >>> qresult.index('gi|301171259|ref|NR_035850.1|')
+            7
+
+        """
+        if isinstance(hit_key, Hit):
+            return list(self.hit_keys).index(hit_key.id)
+        try:
+            return list(self.hit_keys).index(hit_key)
+        except ValueError:
+            if hit_key in self.__alt_hit_ids:
+                return self.index(self.__alt_hit_ids[hit_key])
+            raise
+
+    def sort(self, key=None, reverse=False, in_place=True):
+        """Sort the Hit objects.
+
+        :param key: sorting function
+        :type key: callable, accepts Hit, returns key for sorting
+        :param reverse: whether to reverse the sorting results or not
+        :type reverse: bool
+        :param in_place: whether to sort in place or not
+        :type in_place: bool
+
+        ``sort`` defaults to sorting in place, to mimic Python's ``list.sort``
+        method. If you set the ``in_place`` argument to False, it will instead
+        return a new, sorted QueryResult object and keep the initial one
+        unsorted.
+
+        """
+        if key is None:
+            # if reverse is True, reverse the hits
+            if reverse:
+                sorted_hits = list(self.hits)[::-1]
+            # otherwise (default options) make a copy of the hits
+            else:
+                sorted_hits = list(self.hits)[:]
+        else:
+            sorted_hits = sorted(self.hits, key=key, reverse=reverse)
+
+        # if sorting is in-place, don't create a new QueryResult object
+        if in_place:
+            new_hits = OrderedDict()
+            for hit in sorted_hits:
+                new_hits[self._hit_key_function(hit)] = hit
+            self._items = new_hits
+        # otherwise, return a new sorted QueryResult object
+        else:
+            obj = self.__class__(sorted_hits, self.id, self._hit_key_function)
+            self._transfer_attrs(obj)
+            return obj
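+
+    # Usage sketch (assumed data): sort hits by decreasing HSP count without
+    # touching the original object:
+    #
+    #     by_hsps = qresult.sort(key=len, reverse=True, in_place=False)
+    #     [len(hit) for hit in by_hsps]   # e.g. [2, 2, 2, 1, ...]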
+
+
+def _hit_key_func(hit):
+    """Map hit to its identifier (PRIVATE).
+
+    Default hit key function for QueryResult.__init__ use.
+    """
+    return hit.id
+
+
+# if not used as a module, run the doctest
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SearchIO/_utils.py b/code/lib/Bio/SearchIO/_utils.py
new file mode 100644
index 0000000..3d801ab
--- /dev/null
+++ b/code/lib/Bio/SearchIO/_utils.py
@@ -0,0 +1,167 @@
+# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Common SearchIO utility functions."""
+
+
+def getattr_str(obj, attr, fmt=None, fallback="?"):
+    """Return string of the given object's attribute.
+
+    Defaults to the given fallback value if attribute is not present.
+    """
+    try:
+        value = getattr(obj, attr)
+    except AttributeError:
+        return fallback
+    if fmt is None:
+        return str(value)
+    return fmt % value
+
+
+def read_forward(handle):
+    """Read through whitespaces, return the first non-whitespace line."""
+    while True:
+        line = handle.readline()
+        # if line is empty or line has characters and stripping does not remove
+        # them, return the line
+        if (not line) or (line and line.strip()):
+            return line
+
+
+def get_processor(format, mapping):
+    """Return the object to process the given format according to the mapping.
+
+    :param format: format name
+    :type format: string, lower case
+    :param mapping: mapping of format name and its processor object
+    :type mapping: dictionary {string: object}
+
+    """
+    # map file format to iterator name
+    try:
+        obj_info = mapping[format]
+    except KeyError:
+        # handle the errors with helpful messages
+        if format is None:
+            raise ValueError("Format required (lower case string)") from None
+        elif not isinstance(format, str):
+            raise TypeError("Need a string for the file format (lower case)") from None
+        elif format != format.lower():
+            raise ValueError("Format string %r should be lower case" % format) from None
+        else:
+            raise ValueError(
+                "Unknown format %r. Supported formats are %r"
+                % (format, "', '".join(mapping))
+            ) from None
+
+    mod_name, obj_name = obj_info
+    mod = __import__("Bio.SearchIO.%s" % mod_name, fromlist=[""])
+
+    return getattr(mod, obj_name)
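+
+# Usage sketch (the mapping shown here is assumed): SearchIO's ``parse`` and
+# ``read`` call this with tables such as
+# {"blast-xml": ("BlastIO", "BlastXmlParser")}, so
+# get_processor("blast-xml", mapping) imports Bio.SearchIO.BlastIO and
+# returns its BlastXmlParser attribute.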
+
+
+def singleitem(attr=None, doc=""):
+    """Property for fetching attribute from first entry of container.
+
+    Returns a property that fetches the given attribute from
+    the first item in a SearchIO container object.
+    """
+
+    def getter(self):
+        if len(self._items) > 1:
+            raise ValueError("More than one HSPFragment object found in HSP")
+        if attr is None:
+            return self._items[0]
+        return getattr(self._items[0], attr)
+
+    return property(fget=getter, doc=doc)
+
+
+def allitems(attr=None, doc=""):
+    """Property for fetching attribute from all entries of container.
+
+    Returns a property that fetches the given attributes from
+    all items in a SearchIO container object.
+    """
+
+    def getter(self):
+        if attr is None:
+            return self._items
+        return [getattr(frag, attr) for frag in self._items]
+
+    return property(fget=getter, doc=doc)
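+
+
+# Typical wiring of the two property factories above (a sketch; this ``HSP``
+# stand-in is not the real Bio.SearchIO HSP class):
+#
+#     class HSP:
+#         def __init__(self, fragments):
+#             self._items = fragments
+#
+#         fragment = singleitem(doc="The lone HSPFragment in this HSP")
+#         query_all = allitems("query", doc="Query sequence of each fragment")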
+
+
+def fullcascade(attr, doc=""):
+    """Return a getter property with a cascading setter.
+
+    This is similar to ``optionalcascade``, but for SearchIO containers that have
+    at least one item (HSP). The getter always retrieves the attribute
+    value from the first item. If the items have more than one attribute value,
+    an error will be raised. The setter behaves like the one returned by
+    ``optionalcascade``, except that it only sets attributes on the items in the
+    object, not on the object itself.
+
+    """
+
+    def getter(self):
+        return getattr(self._items[0], attr)
+
+    def setter(self, value):
+        for item in self:
+            setattr(item, attr, value)
+
+    return property(fget=getter, fset=setter, doc=doc)
+
+
+def optionalcascade(cont_attr, item_attr, doc=""):
+    """Return a getter property with a cascading setter.
+
+    This is used for the ``id`` and ``description`` properties of the container
+    objects with zero or more items. These items have their own private
+    attributes that stores query and/or hit ID and description. When the
+    container has zero items, attribute values are always retrieved from the
+    container's attribute. Otherwise, the first item's attribute is used.
+
+    To keep the container items' query and/or hit ID and description in-sync,
+    the setter cascades any new value given to the items' values.
+
+    """
+
+    def getter(self):
+        if self._items:
+            # don't use self._items here, so QueryResult can use this property
+            # as well (the underlying OrderedDict is not integer-indexable)
+            return getattr(self[0], item_attr)
+        else:
+            return getattr(self, cont_attr)
+
+    def setter(self, value):
+        setattr(self, cont_attr, value)
+        for item in self:
+            setattr(item, item_attr, value)
+
+    return property(fget=getter, fset=setter, doc=doc)
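+
+
+# Sketch of optionalcascade in use (illustrative; ``Container`` is a
+# stand-in for a SearchIO container such as QueryResult, not the real class):
+#
+#     class Container:
+#         def __init__(self, items, id=None):
+#             self._items = items
+#             self._id = id
+#
+#         def __iter__(self):
+#             return iter(self._items)
+#
+#         def __getitem__(self, index):
+#             return self._items[index]
+#
+#         id = optionalcascade("_id", "query_id", doc="Query identifier")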
+
+
+def fragcascade(attr, seq_type, doc=""):
+    """Return a getter property with cascading setter, for HSPFragment objects.
+
+    Similar to ``optionalcascade``, but for HSPFragment objects; acts on the
+    ``query`` or ``hit`` property of the object if it is not None.
+
+    """
+    assert seq_type in ("hit", "query")
+    attr_name = "_%s_%s" % (seq_type, attr)
+
+    def getter(self):
+        return getattr(self, attr_name)
+
+    def setter(self, value):
+        setattr(self, attr_name, value)
+        seq = getattr(self, seq_type)
+        if seq is not None:
+            setattr(seq, attr, value)
+
+    return property(fget=getter, fset=setter, doc=doc)
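+
+
+# For instance, an HSPFragment-style class can expose ``hit_id`` so that
+# assigning to it also updates the underlying hit SeqRecord (a sketch, not
+# the actual Bio.SearchIO._model code):
+#
+#     class Fragment:
+#         def __init__(self, hit=None):
+#             self._hit_id = None
+#             self.hit = hit
+#
+#         hit_id = fragcascade("id", "hit", doc="Hit sequence ID")
+#
+#     frag = Fragment(hit=some_seqrecord)   # hypothetical SeqRecord
+#     frag.hit_id = "hit1"                  # also sets some_seqrecord.id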
diff --git a/code/lib/Bio/Seq.py b/code/lib/Bio/Seq.py
new file mode 100644
index 0000000..2b47b5a
--- /dev/null
+++ b/code/lib/Bio/Seq.py
@@ -0,0 +1,3223 @@
+# Copyright 2000 Andrew Dalke.
+# Copyright 2000-2002 Brad Chapman.
+# Copyright 2004-2005, 2010 by M de Hoon.
+# Copyright 2007-2020 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Provide objects to represent biological sequences.
+
+See also the Seq_ wiki and the chapter in our tutorial:
+ - `HTML Tutorial`_
+ - `PDF Tutorial`_
+
+.. _Seq: http://biopython.org/wiki/Seq
+.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+
+"""
+import array
+import warnings
+
+from abc import ABC
+from abc import abstractmethod
+
+from Bio import BiopythonDeprecationWarning
+from Bio import BiopythonWarning
+from Bio.Data import CodonTable
+from Bio.Data import IUPACData
+
+
+def _maketrans(complement_mapping):
+    """Make a python string translation table (PRIVATE).
+
+    Arguments:
+     - complement_mapping - a dictionary such as ambiguous_dna_complement
+       and ambiguous_rna_complement from Data.IUPACData.
+
+    Returns a translation table (a string of length 256) for use with the
+    python string's translate method to use in a (reverse) complement.
+
+    Compatible with lower case and upper case sequences.
+
+    For internal use only.
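+
+    Illustrative doctest, using a minimal unambiguous DNA mapping instead of
+    the full IUPAC dictionaries:
+
+    >>> table = _maketrans({"A": "T", "T": "A", "C": "G", "G": "C"})
+    >>> b"ACGTacgt".translate(table)
+    b'TGCAtgca'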
+    """
+    keys = "".join(complement_mapping.keys()).encode("ASCII")
+    values = "".join(complement_mapping.values()).encode("ASCII")
+    return bytes.maketrans(keys + keys.lower(), values + values.lower())
+
+
+_dna_complement_table = _maketrans(IUPACData.ambiguous_dna_complement)
+ambiguous_rna_complement = dict(IUPACData.ambiguous_rna_complement)
+ambiguous_rna_complement["T"] = ambiguous_rna_complement["U"]
+_rna_complement_table = _maketrans(ambiguous_rna_complement)
+del ambiguous_rna_complement
+
+
+class SequenceDataAbstractBaseClass(ABC):
+    """Abstract base class for sequence content providers.
+
+    Most users will not need to use this class. It is used internally as a base
+    class for sequence content provider classes such as _UndefinedSequenceData
+    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
+    Instances of these classes can be used instead of a ``bytes`` object as the
+    data argument when creating a Seq object, and provide the sequence content
+    only when requested via ``__getitem__``. This allows lazy parsers to load
+    and parse sequence data from a file only for the requested sequence regions,
+    and _UndefinedSequenceData instances to raise an exception when undefined
+    sequence data are requested.
+
+    Future implementations of lazy parsers that similarly provide on-demand
+    parsing of sequence data should use a subclass of this abstract class and
+    implement the abstract methods ``__len__`` and ``__getitem__``:
+
+    * ``__len__`` must return the sequence length;
+    * ``__getitem__`` must return
+
+      * a ``bytes`` object for the requested region; or
+      * a new instance of the subclass for the requested region; or
+      * raise an ``UndefinedSequenceError``.
+
+      Calling ``__getitem__`` for a sequence region of size zero should always
+      return an empty ``bytes`` object.
+      Calling ``__getitem__`` for the full sequence (as in data[:]) should
+      either return a ``bytes`` object with the full sequence, or raise an
+      ``UndefinedSequenceError``.
+
+    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
+    as part of their ``__init__`` method.
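+
+    A minimal sketch of a conforming subclass (illustrative only; it handles
+    slice access, which is all that ``bytes(self)`` requires):
+
+    >>> class _ConstantSequenceData(SequenceDataAbstractBaseClass):
+    ...     def __init__(self, letter, length):
+    ...         self.letter = letter
+    ...         self.length = length
+    ...         super().__init__()
+    ...     def __len__(self):
+    ...         return self.length
+    ...     def __getitem__(self, key):
+    ...         start, stop, step = key.indices(self.length)
+    ...         return self.letter * len(range(start, stop, step))
+    >>> bytes(_ConstantSequenceData(b"N", 5))
+    b'NNNNN'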
+    """
+
+    __slots__ = ()
+
+    def __init__(self):
+        """Check if ``__getitem__`` returns a bytes-like object."""
+        assert self[:0] == b""
+
+    @abstractmethod
+    def __len__(self):
+        pass
+
+    @abstractmethod
+    def __getitem__(self, key):
+        pass
+
+    def __bytes__(self):
+        return self[:]
+
+    def __hash__(self):
+        return hash(bytes(self))
+
+    def __eq__(self, other):
+        return bytes(self) == other
+
+    def __lt__(self, other):
+        return bytes(self) < other
+
+    def __le__(self, other):
+        return bytes(self) <= other
+
+    def __gt__(self, other):
+        return bytes(self) > other
+
+    def __ge__(self, other):
+        return bytes(self) >= other
+
+    def __add__(self, other):
+        return bytes(self) + other
+
+    def __radd__(self, other):
+        return other + bytes(self)
+
+    def __mul__(self, other):
+        return bytes(self) * other
+
+    def __contains__(self, item):
+        return bytes(self).__contains__(item)
+
+    def decode(self, encoding="utf-8"):
+        """Decode the data as bytes using the codec registered for encoding.
+
+        encoding
+          The encoding with which to decode the bytes.
+        """
+        return bytes(self).decode(encoding)
+
+    def count(self, sub, start=None, end=None):
+        """Return the number of non-overlapping occurrences of sub in data[start:end].
+
+        Optional arguments start and end are interpreted as in slice notation.
+        """
+        return bytes(self).count(sub, start, end)
+
+    def find(self, sub, start=None, end=None):
+        """Return the lowest index in data where subsection sub is found.
+
+        Return the lowest index in data where subsection sub is found,
+        such that sub is contained within data[start:end].  Optional
+        arguments start and end are interpreted as in slice notation.
+
+        Return -1 on failure.
+        """
+        return bytes(self).find(sub, start, end)
+
+    def rfind(self, sub, start=None, end=None):
+        """Return the highest index in data where subsection sub is found.
+
+        Return the highest index in data where subsection sub is found,
+        such that sub is contained within data[start:end].  Optional
+        arguments start and end are interpreted as in slice notation.
+
+        Return -1 on failure.
+        """
+        return bytes(self).rfind(sub, start, end)
+
+    def index(self, sub, start=None, end=None):
+        """Return the lowest index in data where subsection sub is found.
+
+        Return the lowest index in data where subsection sub is found,
+        such that sub is contained within data[start:end].  Optional
+        arguments start and end are interpreted as in slice notation.
+
+        Raises ValueError when the subsection is not found.
+        """
+        return bytes(self).index(sub, start, end)
+
+    def rindex(self, sub, start=None, end=None):
+        """Return the highest index in data where subsection sub is found.
+
+        Return the highest index in data where subsection sub is found,
+        such that sub is contained within data[start:end].  Optional
+        arguments start and end are interpreted as in slice notation.
+
+        Raise ValueError when the subsection is not found.
+        """
+        return bytes(self).rindex(sub, start, end)
+
+    def startswith(self, prefix, start=None, end=None):
+        """Return True if data starts with the specified prefix, False otherwise.
+
+        With optional start, test data beginning at that position.
+        With optional end, stop comparing data at that position.
+        prefix can also be a tuple of bytes to try.
+        """
+        return bytes(self).startswith(prefix, start, end)
+
+    def endswith(self, suffix, start=None, end=None):
+        """Return True if data ends with the specified suffix, False otherwise.
+
+        With optional start, test data beginning at that position.
+        With optional end, stop comparing data at that position.
+        suffix can also be a tuple of bytes to try.
+        """
+        return bytes(self).endswith(suffix, start, end)
+
+    def split(self, sep=None, maxsplit=-1):
+        """Return a list of the sections in the data, using sep as the delimiter.
+
+        sep
+          The delimiter according to which the data is split.
+          None (the default value) means split on ASCII whitespace characters
+          (space, tab, return, newline, formfeed, vertical tab).
+        maxsplit
+          Maximum number of splits to do.
+          -1 (the default value) means no limit.
+        """
+        return bytes(self).split(sep, maxsplit)
+
+    def rsplit(self, sep=None, maxsplit=-1):
+        """Return a list of the sections in the data, using sep as the delimiter.
+
+        sep
+          The delimiter according to which the data is split.
+          None (the default value) means split on ASCII whitespace characters
+          (space, tab, return, newline, formfeed, vertical tab).
+        maxsplit
+          Maximum number of splits to do.
+          -1 (the default value) means no limit.
+
+        Splitting is done starting at the end of the data and working to the front.
+        """
+        return bytes(self).rsplit(sep, maxsplit)
+
+    def strip(self, chars=None):
+        """Strip leading and trailing characters contained in the argument.
+
+        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
+        """
+        return bytes(self).strip(chars)
+
+    def lstrip(self, chars=None):
+        """Strip leading characters contained in the argument.
+
+        If the argument is omitted or None, strip leading ASCII whitespace.
+        """
+        return bytes(self).lstrip(chars)
+
+    def rstrip(self, chars=None):
+        """Strip trailing characters contained in the argument.
+
+        If the argument is omitted or None, strip trailing ASCII whitespace.
+        """
+        return bytes(self).rstrip(chars)
+
+    def upper(self):
+        """Return a copy of data with all ASCII characters converted to uppercase."""
+        return bytes(self).upper()
+
+    def lower(self):
+        """Return a copy of data with all ASCII characters converted to lowercase."""
+        return bytes(self).lower()
+
+    def replace(self, old, new):
+        """Return a copy with all occurrences of substring old replaced by new."""
+        return bytes(self).replace(old, new)
+
+    def translate(self, table, delete=b""):
+        """Return a copy with each character mapped by the given translation table.
+
+          table
+            Translation table, which must be a bytes object of length 256.
+
+        All characters occurring in the optional argument delete are removed.
+        The remaining characters are mapped through the given translation table.
+        """
+        return bytes(self).translate(table, delete)
+
+
+class _SeqAbstractBaseClass(ABC):
+    """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
+
+    Most users will not need to use this class. It is used internally as an
+    abstract base class for Seq and MutableSeq, as most of their methods are
+    identical.
+    """
+
+    __slots__ = ("_data",)
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    def __bytes__(self):
+        return bytes(self._data)
+
+    def __repr__(self):
+        """Return (truncated) representation of the sequence."""
+        data = self._data
+        if isinstance(data, _UndefinedSequenceData):
+            return f"Seq(None, length={len(self)})"
+        if len(data) > 60:
+            # Shows the last three letters as it is often useful to see if
+            # there is a stop codon at the end of a sequence.
+            # Note total length is 54+3+3=60
+            start = data[:54].decode("ASCII")
+            end = data[-3:].decode("ASCII")
+            return f"{self.__class__.__name__}('{start}...{end}')"
+        else:
+            data = data.decode("ASCII")
+            return f"{self.__class__.__name__}('{data}')"
+
+    def __str__(self):
+        """Return the full sequence as a python string."""
+        return self._data.decode("ASCII")
+
+    def __eq__(self, other):
+        """Compare the sequence to another sequence or a string.
+
+        Sequences are equal to each other if their sequence contents are
+        identical:
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> seq1 = Seq("ACGT")
+        >>> seq2 = Seq("ACGT")
+        >>> mutable_seq = MutableSeq("ACGT")
+        >>> seq1 == seq2
+        True
+        >>> seq1 == mutable_seq
+        True
+        >>> seq1 == "ACGT"
+        True
+
+        Note that the sequence objects themselves are not identical to each
+        other:
+
+        >>> id(seq1) == id(seq2)
+        False
+        >>> seq1 is seq2
+        False
+
+        Sequences can also be compared to strings, ``bytes``, and ``bytearray``
+        objects:
+
+        >>> seq1 == "ACGT"
+        True
+        >>> seq1 == b"ACGT"
+        True
+        >>> seq1 == bytearray(b"ACGT")
+        True
+        """
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self._data == other._data
+        elif isinstance(other, str):
+            return self._data == other.encode("ASCII")
+        else:
+            return self._data == other
+
+    def __lt__(self, other):
+        """Implement the less-than operand."""
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self._data < other._data
+        elif isinstance(other, str):
+            return self._data < other.encode("ASCII")
+        else:
+            return self._data < other
+
+    def __le__(self, other):
+        """Implement the less-than or equal operand."""
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self._data <= other._data
+        elif isinstance(other, str):
+            return self._data <= other.encode("ASCII")
+        else:
+            return self._data <= other
+
+    def __gt__(self, other):
+        """Implement the greater-than operand."""
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self._data > other._data
+        elif isinstance(other, str):
+            return self._data > other.encode("ASCII")
+        else:
+            return self._data > other
+
+    def __ge__(self, other):
+        """Implement the greater-than or equal operand."""
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self._data >= other._data
+        elif isinstance(other, str):
+            return self._data >= other.encode("ASCII")
+        else:
+            return self._data >= other
+
+    def __len__(self):
+        """Return the length of the sequence."""
+        return len(self._data)
+
+    def __getitem__(self, index):
+        """Return a subsequence as a single letter or as a sequence object.
+
+        If the index is an integer, a single letter is returned as a Python
+        string:
+
+        >>> seq = Seq('ACTCGACGTCG')
+        >>> seq[5]
+        'A'
+
+        Otherwise, a new sequence object of the same class is returned:
+
+        >>> seq[5:8]
+        Seq('ACG')
+        >>> mutable_seq = MutableSeq('ACTCGACGTCG')
+        >>> mutable_seq[5:8]
+        MutableSeq('ACG')
+        """
+        if isinstance(index, int):
+            # Return a single letter as a string
+            return chr(self._data[index])
+        else:
+            # Return the (sub)sequence as another Seq/MutableSeq object
+            return self.__class__(self._data[index])
+
+    def __add__(self, other):
+        """Add a sequence or string to this sequence.
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> Seq("MELKI") + "LV"
+        Seq('MELKILV')
+        >>> MutableSeq("MELKI") + "LV"
+        MutableSeq('MELKILV')
+        """
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self.__class__(self._data + other._data)
+        elif isinstance(other, str):
+            return self.__class__(self._data + other.encode("ASCII"))
+
+        from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports
+
+        if isinstance(other, SeqRecord):
+            # Get the SeqRecord's __radd__ to handle this
+            return NotImplemented
+        else:
+            raise TypeError
+
+    def __radd__(self, other):
+        """Add a sequence string on the left.
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> "LV" + Seq("MELKI")
+        Seq('LVMELKI')
+        >>> "LV" + MutableSeq("MELKI")
+        MutableSeq('LVMELKI')
+
+        Adding two sequence objects is handled via the __add__ method.
+        """
+        if isinstance(other, str):
+            return self.__class__(other.encode("ASCII") + self._data)
+        else:
+            raise TypeError
+
+    def __mul__(self, other):
+        """Multiply sequence by integer.
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> Seq('ATG') * 2
+        Seq('ATGATG')
+        >>> MutableSeq('ATG') * 2
+        MutableSeq('ATGATG')
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(self._data * other)
+
+    def __rmul__(self, other):
+        """Multiply integer by sequence.
+
+        >>> from Bio.Seq import Seq
+        >>> 2 * Seq('ATG')
+        Seq('ATGATG')
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(self._data * other)
+
+    def __imul__(self, other):
+        """Multiply the sequence object by other and assign.
+
+        >>> from Bio.Seq import Seq
+        >>> seq = Seq('ATG')
+        >>> seq *= 2
+        >>> seq
+        Seq('ATGATG')
+
+        Note that this is different from in-place multiplication. The ``seq``
+        variable is reassigned to the multiplication result, but any other
+        variable pointing to the original sequence object remains unchanged:
+
+        >>> seq = Seq('ATG')
+        >>> seq2 = seq
+        >>> id(seq) == id(seq2)
+        True
+        >>> seq *= 2
+        >>> seq
+        Seq('ATGATG')
+        >>> seq2
+        Seq('ATG')
+        >>> id(seq) == id(seq2)
+        False
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(self._data * other)
+
+    def count(self, sub, start=None, end=None):
+        """Return a non-overlapping count, like that of a python string.
+
+        The number of occurrences of substring argument sub in the
+        (sub)sequence given by [start:end] is returned as an integer.
+        Optional arguments start and end are interpreted as in slice
+        notation.
+
+        Arguments:
+         - sub - a string or another Seq object to look for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_seq = Seq("AAAATGA")
+        >>> print(my_seq.count("A"))
+        5
+        >>> print(my_seq.count("ATG"))
+        1
+        >>> print(my_seq.count(Seq("AT")))
+        1
+        >>> print(my_seq.count("AT", 2, -1))
+        1
+
+        HOWEVER, please note that because the ``count`` method of Seq and
+        MutableSeq objects, like that of Python strings, does a
+        non-overlapping search, this may not give the answer you expect:
+
+        >>> "AAAA".count("AA")
+        2
+        >>> print(Seq("AAAA").count("AA"))
+        2
+
+        For an overlapping search, use the ``count_overlap`` method:
+
+        >>> print(Seq("AAAA").count_overlap("AA"))
+        3
+        """
+        if isinstance(sub, MutableSeq):
+            sub = sub._data
+        elif isinstance(sub, Seq):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        return self._data.count(sub, start, end)
+
+    def count_overlap(self, sub, start=None, end=None):
+        """Return an overlapping count.
+
+        Returns an integer, the number of occurrences of substring
+        argument sub in the (sub)sequence given by [start:end].
+        Optional arguments start and end are interpreted as in slice
+        notation.
+
+        Arguments:
+         - sub - a string or another Seq object to look for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> print(Seq("AAAA").count_overlap("AA"))
+        3
+        >>> print(Seq("ATATATATA").count_overlap("ATA"))
+        4
+        >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
+        1
+
+        For a non-overlapping search, use the ``count`` method:
+
+        >>> print(Seq("AAAA").count("AA"))
+        2
+
+        Where substrings do not overlap, ``count_overlap`` behaves the same as
+        the ``count`` method:
+
+        >>> from Bio.Seq import Seq
+        >>> my_seq = Seq("AAAATGA")
+        >>> print(my_seq.count_overlap("A"))
+        5
+        >>> my_seq.count_overlap("A") == my_seq.count("A")
+        True
+        >>> print(my_seq.count_overlap("ATG"))
+        1
+        >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
+        True
+        >>> print(my_seq.count_overlap(Seq("AT")))
+        1
+        >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
+        True
+        >>> print(my_seq.count_overlap("AT", 2, -1))
+        1
+        >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
+        True
+
+        HOWEVER, do not use this method for such cases because the
+        count() method is much more efficient.
+        """
+        if isinstance(sub, MutableSeq):
+            sub = sub._data
+        elif isinstance(sub, Seq):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        data = self._data
+        overlap_count = 0
+        while True:
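+            # bytes.find returns -1 when sub is absent; adding 1 maps that to
+            # 0 (the loop-exit sentinel) and otherwise moves start one past
+            # the match's first position, so overlapping matches are counted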
+            start = data.find(sub, start, end) + 1
+            if start != 0:
+                overlap_count += 1
+            else:
+                return overlap_count
+
+    def __contains__(self, item):
+        """Return True if item is a subsequence of the sequence, and False otherwise.
+
+        e.g.
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> my_dna = Seq("ATATGAAATTTGAAAA")
+        >>> "AAA" in my_dna
+        True
+        >>> Seq("AAA") in my_dna
+        True
+        >>> MutableSeq("AAA") in my_dna
+        True
+        """
+        if isinstance(item, _SeqAbstractBaseClass):
+            item = bytes(item)
+        elif isinstance(item, str):
+            item = item.encode("ASCII")
+        return item in self._data
+
+    def find(self, sub, start=None, end=None):
+        """Return the lowest index in the sequence where subsequence sub is found.
+
+        With optional arguments start and end, return the lowest index in the
+        sequence such that the subsequence sub is contained within the sequence
+        region [start:end].
+
+        Arguments:
+         - sub - a string or another Seq or MutableSeq object to search for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        Returns -1 if the subsequence is NOT found.
+
+        e.g. Locating the first typical start codon, AUG, in an RNA sequence:
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.find("AUG")
+        3
+
+        The next typical start codon can then be found by starting the search
+        at position 4:
+
+        >>> my_rna.find("AUG", 4)
+        15
+        """
+        if isinstance(sub, _SeqAbstractBaseClass):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        return self._data.find(sub, start, end)
+
+    def rfind(self, sub, start=None, end=None):
+        """Return the highest index in the sequence where subsequence sub is found.
+
+        With optional arguments start and end, return the highest index in the
+        sequence such that the subsequence sub is contained within the sequence
+        region [start:end].
+
+        Arguments:
+         - sub - a string or another Seq or MutableSeq object to search for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        Returns -1 if the subsequence is NOT found.
+
+        e.g. Locating the last typical start codon, AUG, in an RNA sequence:
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.rfind("AUG")
+        15
+
+        The location of the typical start codon before that can be found by
+        ending the search at position 15:
+
+        >>> my_rna.rfind("AUG", end=15)
+        3
+        """
+        if isinstance(sub, _SeqAbstractBaseClass):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        return self._data.rfind(sub, start, end)
+
+    def index(self, sub, start=None, end=None):
+        """Return the lowest index in the sequence where subsequence sub is found.
+
+        With optional arguments start and end, return the lowest index in the
+        sequence such that the subsequence sub is contained within the sequence
+        region [start:end].
+
+        Arguments:
+         - sub - a string or another Seq or MutableSeq object to search for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        Raises a ValueError if the subsequence is NOT found.
+
+        e.g. Locating the first typical start codon, AUG, in an RNA sequence:
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.index("AUG")
+        3
+
+        The next typical start codon can then be found by starting the search
+        at position 4:
+
+        >>> my_rna.index("AUG", 4)
+        15
+
+        This method performs the same search as the ``find`` method.  However,
+        if the subsequence is not found, ``find`` returns -1 whereas ``index``
+        raises a ValueError:
+
+        >>> my_rna.index("T")
+        Traceback (most recent call last):
+                   ...
+        ValueError: ...
+        >>> my_rna.find("T")
+        -1
+        """
+        if isinstance(sub, MutableSeq):
+            sub = sub._data
+        elif isinstance(sub, Seq):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        return self._data.index(sub, start, end)
+
+    def rindex(self, sub, start=None, end=None):
+        """Return the highest index in the sequence where subsequence sub is found.
+
+        With optional arguments start and end, return the highest index in the
+        sequence such that the subsequence sub is contained within the sequence
+        region [start:end].
+
+        Arguments:
+         - sub - a string or another Seq or MutableSeq object to search for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        Raises a ValueError if the subsequence is NOT found.
+
+        e.g. Locating the last typical start codon, AUG, in an RNA sequence:
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.rindex("AUG")
+        15
+
+        The location of the typical start codon before that can be found by
+        ending the search at position 15:
+
+        >>> my_rna.rindex("AUG", end=15)
+        3
+
+        This method performs the same search as the ``rfind`` method.  However,
+        if the subsequence is not found, ``rfind`` returns -1 whereas ``rindex``
+        raises a ValueError:
+
+        >>> my_rna.rindex("T")
+        Traceback (most recent call last):
+                   ...
+        ValueError: ...
+        >>> my_rna.rfind("T")
+        -1
+        """
+        if isinstance(sub, MutableSeq):
+            sub = sub._data
+        elif isinstance(sub, Seq):
+            sub = bytes(sub)
+        elif isinstance(sub, str):
+            sub = sub.encode("ASCII")
+        elif not isinstance(sub, (bytes, bytearray)):
+            raise TypeError(
+                "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+                % type(sub)
+            )
+        return self._data.rindex(sub, start, end)
+
+    def startswith(self, prefix, start=None, end=None):
+        """Return True if the sequence starts with the given prefix, False otherwise.
+
+        Return True if the sequence starts with the specified prefix
+        (a string or another Seq object), False otherwise.
+        With optional start, test sequence beginning at that position.
+        With optional end, stop comparing sequence at that position.
+        prefix can also be a tuple of strings to try.  e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.startswith("GUC")
+        True
+        >>> my_rna.startswith("AUG")
+        False
+        >>> my_rna.startswith("AUG", 3)
+        True
+        >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
+        True
+        """
+        if isinstance(prefix, tuple):
+            prefix = tuple(
+                bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
+                for p in prefix
+            )
+        elif isinstance(prefix, _SeqAbstractBaseClass):
+            prefix = bytes(prefix)
+        elif isinstance(prefix, str):
+            prefix = prefix.encode("ASCII")
+        return self._data.startswith(prefix, start, end)
+
+    def endswith(self, suffix, start=None, end=None):
+        """Return True if the sequence ends with the given suffix, False otherwise.
+
+        Return True if the sequence ends with the specified suffix
+        (a string or another Seq object), False otherwise.
+        With optional start, test sequence beginning at that position.
+        With optional end, stop comparing sequence at that position.
+        suffix can also be a tuple of strings to try.  e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_rna.endswith("UUG")
+        True
+        >>> my_rna.endswith("AUG")
+        False
+        >>> my_rna.endswith("AUG", 0, 18)
+        True
+        >>> my_rna.endswith(("UCC", "UCA", "UUG"))
+        True
+        """
+        if isinstance(suffix, tuple):
+            suffix = tuple(
+                bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
+                for p in suffix
+            )
+        elif isinstance(suffix, _SeqAbstractBaseClass):
+            suffix = bytes(suffix)
+        elif isinstance(suffix, str):
+            suffix = suffix.encode("ASCII")
+        return self._data.endswith(suffix, start, end)
+
+    def split(self, sep=None, maxsplit=-1):
+        """Return a list of subsequences when splitting the sequence by separator sep.
+
+        Return a list of the subsequences in the sequence (as Seq objects),
+        using sep as the delimiter string.  If maxsplit is given, at
+        most maxsplit splits are done.  If maxsplit is omitted, all
+        splits are made.
+
+        For consistency with the ``split`` method of Python strings, any
+        whitespace (tabs, spaces, newlines) is a separator if sep is None, the
+        default value.
+
+        e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_aa = my_rna.translate()
+        >>> my_aa
+        Seq('VMAIVMGR*KGAR*L')
+        >>> for pep in my_aa.split("*"):
+        ...     pep
+        Seq('VMAIVMGR')
+        Seq('KGAR')
+        Seq('L')
+        >>> for pep in my_aa.split("*", 1):
+        ...     pep
+        Seq('VMAIVMGR')
+        Seq('KGAR*L')
+
+        See also the rsplit method, which splits the sequence starting from the
+        end:
+
+        >>> for pep in my_aa.rsplit("*", 1):
+        ...     pep
+        Seq('VMAIVMGR*KGAR')
+        Seq('L')
+        """
+        if isinstance(sep, _SeqAbstractBaseClass):
+            sep = bytes(sep)
+        elif isinstance(sep, str):
+            sep = sep.encode("ASCII")
+        return [Seq(part) for part in self._data.split(sep, maxsplit)]
+
+    def rsplit(self, sep=None, maxsplit=-1):
+        """Return a list of subsequences by splitting the sequence from the right.
+
+        Return a list of the subsequences in the sequence (as Seq objects),
+        using sep as the delimiter string.  If maxsplit is given, at
+        most maxsplit splits are done.  If maxsplit is omitted, all
+        splits are made.
+
+        For consistency with the ``rsplit`` method of Python strings, any
+        whitespace (tabs, spaces, newlines) is a separator if sep is None, the
+        default value.
+
+        e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_aa = my_rna.translate()
+        >>> my_aa
+        Seq('VMAIVMGR*KGAR*L')
+        >>> for pep in my_aa.rsplit("*"):
+        ...     pep
+        Seq('VMAIVMGR')
+        Seq('KGAR')
+        Seq('L')
+        >>> for pep in my_aa.rsplit("*", 1):
+        ...     pep
+        Seq('VMAIVMGR*KGAR')
+        Seq('L')
+
+        See also the split method, which splits the sequence starting from the
+        beginning:
+
+        >>> for pep in my_aa.split("*", 1):
+        ...     pep
+        Seq('VMAIVMGR')
+        Seq('KGAR*L')
+        """
+        if isinstance(sep, _SeqAbstractBaseClass):
+            sep = bytes(sep)
+        elif isinstance(sep, str):
+            sep = sep.encode("ASCII")
+        return [Seq(part) for part in self._data.rsplit(sep, maxsplit)]
+
+    def strip(self, chars=None, inplace=False):
+        """Return a sequence object with leading and trailing ends stripped.
+
+        With default arguments, leading and trailing whitespace is removed:
+
+        >>> seq = Seq(" ACGT ")
+        >>> seq.strip()
+        Seq('ACGT')
+        >>> seq
+        Seq(' ACGT ')
+
+        If ``chars`` is given and not ``None``, remove characters in ``chars``
+        instead.  The order of the characters to be removed is not important:
+
+        >>> Seq("ACGTACGT").strip("TGCA")
+        Seq('')
+
+        A copy of the sequence is returned if ``inplace`` is ``False`` (the
+        default value).  If ``inplace`` is ``True``, the sequence is stripped
+        in-place and returned.
+
+        >>> seq = MutableSeq(" ACGT ")
+        >>> seq.strip(inplace=False)
+        MutableSeq('ACGT')
+        >>> seq
+        MutableSeq(' ACGT ')
+        >>> seq.strip(inplace=True)
+        MutableSeq('ACGT')
+        >>> seq
+        MutableSeq('ACGT')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
+        is called on a ``Seq`` object with ``inplace=True``.
+
+        See also the lstrip and rstrip methods.
+        """
+        if isinstance(chars, _SeqAbstractBaseClass):
+            chars = bytes(chars)
+        elif isinstance(chars, str):
+            chars = chars.encode("ASCII")
+        try:
+            data = self._data.strip(chars)
+        except TypeError:
+            raise TypeError(
+                "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
+            ) from None
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        elif isinstance(self, UnknownSeq):
+            return Seq(data)
+        else:
+            return self.__class__(data)
+
+    def lstrip(self, chars=None, inplace=False):
+        """Return a sequence object with leading and trailing ends stripped.
+
+        With default arguments, leading whitespace is removed:
+
+        >>> seq = Seq(" ACGT ")
+        >>> seq.lstrip()
+        Seq('ACGT ')
+        >>> seq
+        Seq(' ACGT ')
+
+        If ``chars`` is given and not ``None``, remove characters in ``chars``
+        from the leading end instead.  The order of the characters to be removed
+        is not important:
+
+        >>> Seq("ACGACGTTACG").lstrip("GCA")
+        Seq('TTACG')
+
+        A copy of the sequence is returned if ``inplace`` is ``False`` (the
+        default value).  If ``inplace`` is ``True``, the sequence is stripped
+        in-place and returned.
+
+        >>> seq = MutableSeq(" ACGT ")
+        >>> seq.lstrip(inplace=False)
+        MutableSeq('ACGT ')
+        >>> seq
+        MutableSeq(' ACGT ')
+        >>> seq.lstrip(inplace=True)
+        MutableSeq('ACGT ')
+        >>> seq
+        MutableSeq('ACGT ')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.
+
+        See also the strip and rstrip methods.
+        """
+        if isinstance(chars, _SeqAbstractBaseClass):
+            chars = bytes(chars)
+        elif isinstance(chars, str):
+            chars = chars.encode("ASCII")
+        try:
+            data = self._data.lstrip(chars)
+        except TypeError:
+            raise TypeError(
+                "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
+            ) from None
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        elif isinstance(self, UnknownSeq):
+            return Seq(data)
+        else:
+            return self.__class__(data)
+
+    def rstrip(self, chars=None, inplace=False):
+        """Return a sequence object with trailing ends stripped.
+
+        With default arguments, trailing whitespace is removed:
+
+        >>> seq = Seq(" ACGT ")
+        >>> seq.rstrip()
+        Seq(' ACGT')
+        >>> seq
+        Seq(' ACGT ')
+
+        If ``chars`` is given and not ``None``, remove characters in ``chars``
+        from the trailing end instead.  The order of the characters to be
+        removed is not important:
+
+        >>> Seq("ACGACGTTACG").rstrip("GCA")
+        Seq('ACGACGTT')
+
+        A copy of the sequence is returned if ``inplace`` is ``False`` (the
+        default value).  If ``inplace`` is ``True``, the sequence is stripped
+        in-place and returned.
+
+        >>> seq = MutableSeq(" ACGT ")
+        >>> seq.rstrip(inplace=False)
+        MutableSeq(' ACGT')
+        >>> seq
+        MutableSeq(' ACGT ')
+        >>> seq.rstrip(inplace=True)
+        MutableSeq(' ACGT')
+        >>> seq
+        MutableSeq(' ACGT')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.
+
+        See also the strip and lstrip methods.
+        """
+        if isinstance(chars, _SeqAbstractBaseClass):
+            chars = bytes(chars)
+        elif isinstance(chars, str):
+            chars = chars.encode("ASCII")
+        try:
+            data = self._data.rstrip(chars)
+        except TypeError:
+            raise TypeError(
+                "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
+            ) from None
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        elif isinstance(self, UnknownSeq):
+            return Seq(data)
+        else:
+            return self.__class__(data)
+
+    def upper(self, inplace=False):
+        """Return the sequence in upper case.
+
+        An upper-case copy of the sequence is returned if inplace is False,
+        the default value:
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> my_seq = Seq("VHLTPeeK*")
+        >>> my_seq
+        Seq('VHLTPeeK*')
+        >>> my_seq.lower()
+        Seq('vhltpeek*')
+        >>> my_seq.upper()
+        Seq('VHLTPEEK*')
+        >>> my_seq
+        Seq('VHLTPeeK*')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> my_seq = MutableSeq("VHLTPeeK*")
+        >>> my_seq
+        MutableSeq('VHLTPeeK*')
+        >>> my_seq.lower()
+        MutableSeq('vhltpeek*')
+        >>> my_seq.upper()
+        MutableSeq('VHLTPEEK*')
+        >>> my_seq
+        MutableSeq('VHLTPeeK*')
+
+        >>> my_seq.lower(inplace=True)
+        MutableSeq('vhltpeek*')
+        >>> my_seq
+        MutableSeq('vhltpeek*')
+        >>> my_seq.upper(inplace=True)
+        MutableSeq('VHLTPEEK*')
+        >>> my_seq
+        MutableSeq('VHLTPEEK*')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``upper`` is called on a ``Seq`` object with ``inplace=True``.
+
+        See also the ``lower`` method.
+        """
+        data = self._data.upper()
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        else:
+            return self.__class__(data)
+
+    def lower(self, inplace=False):
+        """Return the sequence in lower case.
+
+        A lower-case copy of the sequence is returned if inplace is False,
+        the default value:
+
+        >>> from Bio.Seq import Seq, MutableSeq
+        >>> my_seq = Seq("VHLTPeeK*")
+        >>> my_seq
+        Seq('VHLTPeeK*')
+        >>> my_seq.lower()
+        Seq('vhltpeek*')
+        >>> my_seq.upper()
+        Seq('VHLTPEEK*')
+        >>> my_seq
+        Seq('VHLTPeeK*')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> my_seq = MutableSeq("VHLTPeeK*")
+        >>> my_seq
+        MutableSeq('VHLTPeeK*')
+        >>> my_seq.lower()
+        MutableSeq('vhltpeek*')
+        >>> my_seq.upper()
+        MutableSeq('VHLTPEEK*')
+        >>> my_seq
+        MutableSeq('VHLTPeeK*')
+
+        >>> my_seq.lower(inplace=True)
+        MutableSeq('vhltpeek*')
+        >>> my_seq
+        MutableSeq('vhltpeek*')
+        >>> my_seq.upper(inplace=True)
+        MutableSeq('VHLTPEEK*')
+        >>> my_seq
+        MutableSeq('VHLTPEEK*')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``lower`` is called on a ``Seq`` object with ``inplace=True``.
+
+        See also the ``upper`` method.
+        """
+        data = self._data.lower()
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        else:
+            return self.__class__(data)
+
+    def translate(
+        self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
+    ):
+        """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.
+
+        This method will translate DNA or RNA sequences. It should not
+        be used on protein sequences as any result will be biologically
+        meaningless.
+
+        Arguments:
+         - table - Which codon table to use?  This can be either a name
+           (string), an NCBI identifier (integer), or a CodonTable
+           object (useful for non-standard genetic codes).  This
+           defaults to the "Standard" table.
+         - stop_symbol - Single character string, what to use for
+           terminators.  This defaults to the asterisk, "*".
+         - to_stop - Boolean, defaults to False meaning do a full
+           translation continuing on past any stop codons (translated as the
+           specified stop_symbol).  If True, translation is terminated at
+           the first in frame stop codon (and the stop_symbol is not
+           appended to the returned protein sequence).
+         - cds - Boolean, indicates this is a complete CDS.  If True,
+           this checks the sequence starts with a valid alternative start
+           codon (which will be translated as methionine, M), that the
+           sequence length is a multiple of three, and that there is a
+           single in frame stop codon at the end (this will be excluded
+           from the protein sequence, regardless of the to_stop option).
+           If these tests fail, an exception is raised.
+         - gap - Single character string to denote symbol used for gaps.
+           Defaults to the minus sign.
+
+        A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
+        object; a ``MutableSeq`` object is returned if ``translate`` is called
+        on a ``MutableSeq`` object.
+
+        e.g. Using the standard table:
+
+        >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
+        >>> coding_dna.translate()
+        Seq('VAIVMGR*KGAR*')
+        >>> coding_dna.translate(stop_symbol="@")
+        Seq('VAIVMGR@KGAR@')
+        >>> coding_dna.translate(to_stop=True)
+        Seq('VAIVMGR')
+
+        Now using NCBI table 2, where TGA is not a stop codon:
+
+        >>> coding_dna.translate(table=2)
+        Seq('VAIVMGRWKGAR*')
+        >>> coding_dna.translate(table=2, to_stop=True)
+        Seq('VAIVMGRWKGAR')
+
+        In fact, GTG is an alternative start codon under NCBI table 2, meaning
+        this sequence could be a complete CDS:
+
+        >>> coding_dna.translate(table=2, cds=True)
+        Seq('MAIVMGRWKGAR')
+
+        It isn't a valid CDS under NCBI table 1, due to both the start codon
+        and also the in frame stop codons:
+
+        >>> coding_dna.translate(table=1, cds=True)
+        Traceback (most recent call last):
+            ...
+        Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon
+
+        If the sequence has no in-frame stop codon, then the to_stop argument
+        has no effect:
+
+        >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
+        >>> coding_dna2.translate()
+        Seq('LAIVMGR')
+        >>> coding_dna2.translate(to_stop=True)
+        Seq('LAIVMGR')
+
+        NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+        or a stop codon.  These are translated as "X".  Any invalid codon
+        (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+        NOTE - This does NOT behave like the python string's translate
+        method.  For that use str(my_seq).translate(...) instead
+        """
+        if isinstance(table, str) and len(table) == 256:
+            raise ValueError(
+                "The Seq and MutableSeq translate methods DO NOT "
+                "take a 256 character string mapping table like "
+                "the python string object's translate method. "
+                "Use str(my_seq).translate(...) instead."
+            )
+
+        try:
+            data = str(self)
+        except UndefinedSequenceError:
+            # translating an undefined sequence yields an undefined
+            # sequence with the length divided by 3
+            n = len(self)
+            if n % 3 != 0:
+                warnings.warn(
+                    "Partial codon, len(sequence) not a multiple of three. "
+                    "This may become an error in future.",
+                    BiopythonWarning,
+                )
+            return Seq(None, n // 3)
+
+        return self.__class__(
+            _translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)
+        )
+
+    def complement_rna(self, inplace=False):
+        """Return the complement as an RNA sequence.
+
+        >>> Seq("CGA").complement_rna()
+        Seq('GCU')
+
+        Any T in the sequence is treated as a U:
+
+        >>> Seq("CGAUT").complement_rna()
+        Seq('GCUAA')
+
+        In contrast, ``complement`` returns a DNA sequence by default:
+
+        >>> Seq("CGA").complement()
+        Seq('GCT')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> my_seq = MutableSeq("CGA")
+        >>> my_seq
+        MutableSeq('CGA')
+        >>> my_seq.complement_rna()
+        MutableSeq('GCU')
+        >>> my_seq
+        MutableSeq('CGA')
+
+        >>> my_seq.complement_rna(inplace=True)
+        MutableSeq('GCU')
+        >>> my_seq
+        MutableSeq('GCU')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
+        """
+        try:
+            data = self._data.translate(_rna_complement_table)
+        except UndefinedSequenceError:
+            # complement of an undefined sequence is an undefined sequence
+            # of the same length
+            return self
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        return self.__class__(data)
+
+    def reverse_complement_rna(self, inplace=False):
+        """Return the reverse complement as an RNA sequence.
+
+        >>> Seq("CGA").reverse_complement_rna()
+        Seq('UCG')
+
+        Any T in the sequence is treated as a U:
+
+        >>> Seq("CGAUT").reverse_complement_rna()
+        Seq('AAUCG')
+
+        In contrast, ``reverse_complement`` returns a DNA sequence by default:
+
+        >>> Seq("CGA").reverse_complement()
+        Seq('TCG')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> my_seq = MutableSeq("CGA")
+        >>> my_seq
+        MutableSeq('CGA')
+        >>> my_seq.reverse_complement_rna()
+        MutableSeq('UCG')
+        >>> my_seq
+        MutableSeq('CGA')
+
+        >>> my_seq.reverse_complement_rna(inplace=True)
+        MutableSeq('UCG')
+        >>> my_seq
+        MutableSeq('UCG')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``reverse_complement_rna`` is called on a ``Seq`` object with
+        ``inplace=True``.
+        """
+        try:
+            data = self._data.translate(_rna_complement_table)
+        except UndefinedSequenceError:
+            # reverse complement of an undefined sequence is an undefined sequence
+            # of the same length
+            return self
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[::-1] = data
+            return self
+        return self.__class__(data[::-1])
+
+    def transcribe(self, inplace=False):
+        """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.
+
+        >>> from Bio.Seq import Seq
+        >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
+        >>> coding_dna
+        Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+        >>> coding_dna.transcribe()
+        Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
+        >>> sequence
+        MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+        >>> sequence.transcribe()
+        MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+        >>> sequence
+        MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+
+        >>> sequence.transcribe(inplace=True)
+        MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+        >>> sequence
+        MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.
+
+        Trying to transcribe an RNA sequence has no effect.
+        If you have a nucleotide sequence which might be DNA or RNA
+        (or even a mixture), calling the transcribe method will ensure
+        any T becomes U.
+
+        Trying to transcribe a protein sequence will replace any
+        T for Threonine with U for Selenocysteine, which has no
+        biologically plausible rationale.
+
+        >>> from Bio.Seq import Seq
+        >>> my_protein = Seq("MAIVMGRT")
+        >>> my_protein.transcribe()
+        Seq('MAIVMGRU')
+        """
+        try:
+            data = self._data.replace(b"T", b"U").replace(b"t", b"u")
+        except UndefinedSequenceError:
+            # transcribing an undefined sequence yields an undefined sequence
+            # of the same length
+            return self
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        return self.__class__(data)
+
+    def back_transcribe(self, inplace=False):
+        """Return the DNA sequence from an RNA sequence by creating a new Seq object.
+
+        >>> from Bio.Seq import Seq
+        >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
+        >>> messenger_rna
+        Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+        >>> messenger_rna.back_transcribe()
+        Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+
+        The sequence is modified in-place and returned if inplace is True:
+
+        >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
+        >>> sequence
+        MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+        >>> sequence.back_transcribe()
+        MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+        >>> sequence
+        MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
+
+        >>> sequence.back_transcribe(inplace=True)
+        MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+        >>> sequence
+        MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.
+
+        Trying to back-transcribe DNA has no effect. If you have a nucleotide
+        sequence which might be DNA or RNA (or even a mixture), calling the
+        back_transcribe method will ensure any U becomes T.
+
+        Trying to back-transcribe a protein sequence will replace any U (for
+        Selenocysteine) with T (for Threonine), which is biologically
+        meaningless.
+
+        >>> from Bio.Seq import Seq
+        >>> my_protein = Seq("MAIVMGRU")
+        >>> my_protein.back_transcribe()
+        Seq('MAIVMGRT')
+        """
+        try:
+            data = self._data.replace(b"U", b"T").replace(b"u", b"t")
+        except UndefinedSequenceError:
+            # back-transcribing an undefined sequence yields an undefined
+            # sequence of the same length
+            return self
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        return self.__class__(data)
+
+    def join(self, other):
+        """Return a merge of the sequences in other, spaced by the sequence from self.
+
+        Accepts a Seq object, MutableSeq object, or string (and iterates over
+        the letters), or an iterable containing Seq, MutableSeq, or string
+        objects. These arguments will be concatenated with the calling sequence
+        as the spacer:
+
+        >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
+        >>> concatenated
+        Seq('AAANNNNNTTTNNNNNPPP')
+
+        Joining the letters of a single sequence:
+
+        >>> Seq('NNNNN').join(Seq("ACGT"))
+        Seq('ANNNNNCNNNNNGNNNNNT')
+        >>> Seq('NNNNN').join("ACGT")
+        Seq('ANNNNNCNNNNNGNNNNNT')
+        """
+        if isinstance(other, _SeqAbstractBaseClass):
+            return self.__class__(str(self).join(str(other)))
+        elif isinstance(other, str):
+            return self.__class__(str(self).join(other))
+
+        from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports
+
+        if isinstance(other, SeqRecord):
+            raise TypeError("Iterable cannot be a SeqRecord")
+
+        for c in other:
+            if isinstance(c, SeqRecord):
+                raise TypeError("Iterable cannot contain SeqRecords")
+            elif not isinstance(c, (str, _SeqAbstractBaseClass)):
+                raise TypeError(
+                    "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
+                )
+        return self.__class__(str(self).join([str(_) for _ in other]))
+
+    def replace(self, old, new, inplace=False):
+        """Return a copy with all occurrences of subsequence old replaced by new.
+
+        >>> s = Seq("ACGTAACCGGTT")
+        >>> t = s.replace("AC", "XYZ")
+        >>> s
+        Seq('ACGTAACCGGTT')
+        >>> t
+        Seq('XYZGTAXYZCGGTT')
+
+        For mutable sequences, passing inplace=True will modify the sequence in place:
+
+        >>> m = MutableSeq("ACGTAACCGGTT")
+        >>> t = m.replace("AC", "XYZ")
+        >>> m
+        MutableSeq('ACGTAACCGGTT')
+        >>> t
+        MutableSeq('XYZGTAXYZCGGTT')
+
+        >>> m = MutableSeq("ACGTAACCGGTT")
+        >>> t = m.replace("AC", "XYZ", inplace=True)
+        >>> m
+        MutableSeq('XYZGTAXYZCGGTT')
+        >>> t
+        MutableSeq('XYZGTAXYZCGGTT')
+
+        As ``Seq`` objects are immutable, a ``TypeError`` is raised if
+        ``replace`` is called on a ``Seq`` object with ``inplace=True``.
+        """
+        if isinstance(old, _SeqAbstractBaseClass):
+            old = bytes(old)
+        elif isinstance(old, str):
+            old = old.encode("ASCII")
+        if isinstance(new, _SeqAbstractBaseClass):
+            new = bytes(new)
+        elif isinstance(new, str):
+            new = new.encode("ASCII")
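+        # By this point both old and new are plain bytes, matching the
+        # bytes-like storage behind self._data.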
+        data = self._data.replace(old, new)
+        if inplace:
+            if not isinstance(self._data, bytearray):
+                raise TypeError("Sequence is immutable")
+            self._data[:] = data
+            return self
+        return self.__class__(data)
+
+
+class Seq(_SeqAbstractBaseClass):
+    """Read-only sequence object (essentially a string with biological methods).
+
+    Like normal python strings, our basic sequence object is immutable.
+    This prevents you from doing my_seq[5] = "A" for example, but does allow
+    Seq objects to be used as dictionary keys.
+
+    The Seq object provides a number of string like methods (such as count,
+    find, split and strip).
+
+    The Seq object also provides some biological methods, such as complement,
+    reverse_complement, transcribe, back_transcribe and translate (which are
+    not applicable to protein sequences).
+    """
+
+    def __init__(self, data, length=None):
+        """Create a Seq object.
+
+        Arguments:
+         - data - Sequence, required (string, bytes, Seq, or MutableSeq)
+         - length - Sequence length, used only if data is None (integer)
+
+        You will typically use Bio.SeqIO to read in sequences from files as
+        SeqRecord objects, whose sequence will be exposed as a Seq object via
+        the seq property.
+
+        However, you can also create a Seq object directly:
+
+        >>> from Bio.Seq import Seq
+        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
+        >>> my_seq
+        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
+        >>> print(my_seq)
+        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+
+        To create a Seq object for a sequence of known length but
+        unknown sequence contents, use None for the data argument and pass
+        the sequence length for the length argument. Trying to access the
+        sequence contents of a Seq object created in this way will raise
+        an UndefinedSequenceError:
+
+        >>> my_undefined_seq = Seq(None, 20)
+        >>> my_undefined_seq
+        Seq(None, length=20)
+        >>> len(my_undefined_seq)
+        20
+        >>> print(my_undefined_seq)
+        Traceback (most recent call last):
+        ...
+        Bio.Seq.UndefinedSequenceError: Sequence content is undefined
+        """
+        if length is None:
+            if isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
+                self._data = data
+            elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
+                self._data = bytes(data)
+            elif isinstance(data, str):
+                self._data = bytes(data, encoding="ASCII")
+            else:
+                raise TypeError(
+                    "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
+                )
+        else:
+            if data is not None:
+                raise ValueError("length may only be given if data is None")
+            self._data = _UndefinedSequenceData(length)
+
+    def __hash__(self):
+        """Hash of the sequence as a string for comparison.
+
+        See Seq object comparison documentation (method ``__eq__`` in
+        particular) as this has changed in Biopython 1.65. Older versions
+        would hash on object identity.
+        """
+        return hash(self._data)
+
+    def tomutable(self):
+        """Return the full sequence as a MutableSeq object.
+
+        >>> from Bio.Seq import Seq
+        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> my_seq
+        Seq('MKQHKAMIVALIVICITAVVAAL')
+        >>> my_seq.tomutable()
+        MutableSeq('MKQHKAMIVALIVICITAVVAAL')
+        """
+        warnings.warn(
+            "myseq.tomutable() is deprecated; please use MutableSeq(myseq) instead.",
+            BiopythonDeprecationWarning,
+        )
+        return MutableSeq(self)
+
+    def encode(self, encoding="utf-8", errors="strict"):
+        """Return an encoded version of the sequence as a bytes object.
+
+        The Seq object aims to match the interface of a Python string.
+
+        This is essentially to save you doing str(my_seq).encode() when
+        you need a bytes string, for example for computing a hash:
+
+        >>> from Bio.Seq import Seq
+        >>> Seq("ACGT").encode("ascii")
+        b'ACGT'
+        """
+        warnings.warn(
+            "myseq.encode has been deprecated; please use bytes(myseq) instead.",
+            BiopythonDeprecationWarning,
+        )
+        return str(self).encode(encoding, errors)
+
+    def complement(self):
+        """Return the complement sequence by creating a new Seq object.
+
+        This method is intended for use with DNA sequences:
+
+        >>> from Bio.Seq import Seq
+        >>> my_dna = Seq("CCCCCGATAG")
+        >>> my_dna
+        Seq('CCCCCGATAG')
+        >>> my_dna.complement()
+        Seq('GGGGGCTATC')
+
+        You can of course use mixed case sequences,
+
+        >>> from Bio.Seq import Seq
+        >>> my_dna = Seq("CCCCCgatA-GD")
+        >>> my_dna
+        Seq('CCCCCgatA-GD')
+        >>> my_dna.complement()
+        Seq('GGGGGctaT-CH')
+
+        Note in the above example, ambiguous character D denotes
+        G, A or T so its complement is H (for C, T or A).
+
+        Note that if the sequence contains neither T nor U, we
+        assume it is DNA and map any A character to T:
+
+        >>> Seq("CGA").complement()
+        Seq('GCT')
+        >>> Seq("CGAT").complement()
+        Seq('GCTA')
+
+        If you actually have RNA, this currently works but we
+        may deprecate this later. We recommend using the new
+        complement_rna method instead:
+
+        >>> Seq("CGAU").complement()
+        Seq('GCUA')
+        >>> Seq("CGAU").complement_rna()
+        Seq('GCUA')
+
+        If the sequence contains both T and U, an exception is
+        raised:
+
+        >>> Seq("CGAUT").complement()
+        Traceback (most recent call last):
+           ...
+        ValueError: Mixed RNA/DNA found
+
+        Trying to complement a protein sequence gives a meaningless
+        sequence:
+
+        >>> my_protein = Seq("MAIVMGR")
+        >>> my_protein.complement()
+        Seq('KTIBKCY')
+
+        Here "M" was interpreted as the IUPAC ambiguity code for
+        "A" or "C", with complement "K" for "T" or "G". Likewise
+        "A" has complement "T". The letter "I" has no defined
+        meaning under the IUPAC convention, and is unchanged.
+        """
+        if isinstance(self._data, _UndefinedSequenceData):
+            # complement of an undefined sequence is an undefined sequence
+            # of the same length
+            return self
+        if (b"U" in self._data or b"u" in self._data) and (
+            b"T" in self._data or b"t" in self._data
+        ):
+            # TODO - Handle this cleanly?
+            raise ValueError("Mixed RNA/DNA found")
+        elif b"U" in self._data or b"u" in self._data:
+            ttable = _rna_complement_table
+        else:
+            ttable = _dna_complement_table
+        # Much faster on really long sequences than the previous loop based
+        # one. Thanks to Michael Palmer, University of Waterloo.
+        return Seq(self._data.translate(ttable))
+
+    def reverse_complement(self):
+        """Return the reverse complement sequence by creating a new Seq object.
+
+        This method is intended for use with DNA sequences:
+
+        >>> from Bio.Seq import Seq
+        >>> my_dna = Seq("CCCCCGATAGNR")
+        >>> my_dna
+        Seq('CCCCCGATAGNR')
+        >>> my_dna.reverse_complement()
+        Seq('YNCTATCGGGGG')
+
+        Note in the above example, since R = G or A, its complement
+        is Y (which denotes C or T).
+
+        You can of course use mixed case sequences,
+
+        >>> from Bio.Seq import Seq
+        >>> my_dna = Seq("CCCCCgatA-G")
+        >>> my_dna
+        Seq('CCCCCgatA-G')
+        >>> my_dna.reverse_complement()
+        Seq('C-TatcGGGGG')
+
+        As discussed for the complement method, if the sequence
+        contains neither T nor U, it is assumed to be DNA and
+        any letter A will be mapped to T.
+
+        If you are dealing with RNA, you should use the new
+        reverse_complement_rna method instead:
+
+        >>> Seq("CGA").reverse_complement()  # defaults to DNA
+        Seq('TCG')
+        >>> Seq("CGA").reverse_complement_rna()
+        Seq('UCG')
+
+        If the sequence contains both T and U, an exception is raised:
+
+        >>> Seq("CGAUT").reverse_complement()
+        Traceback (most recent call last):
+           ...
+        ValueError: Mixed RNA/DNA found
+
+        Trying to reverse complement a protein sequence will give
+        a meaningless sequence:
+
+        >>> from Bio.Seq import Seq
+        >>> my_protein = Seq("MAIVMGR")
+        >>> my_protein.reverse_complement()
+        Seq('YCKBITK')
+
+        Here "M" was interpreted as the IUPAC ambiguity code for
+        "A" or "C", with complement "K" for "T" or "G" - and so on.
+        """
+        # Use -1 stride/step to reverse the complement
+        return self.complement()[::-1]
+
+    def ungap(self, gap="-"):
+        """Return a copy of the sequence without the gap character(s) (OBSOLETE).
+
+        The gap character now defaults to the minus sign, and can only
+        be specified via the method argument. This is no longer possible
+        via the sequence's alphabet (as was possible up to Biopython 1.77):
+
+        >>> from Bio.Seq import Seq
+        >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA")
+        >>> my_dna
+        Seq('-ATA--TGAAAT-TTGAAAA')
+        >>> my_dna.ungap("-")
+        Seq('ATATGAAATTTGAAAA')
+
+        This method is OBSOLETE; please use my_dna.replace(gap, "") instead.
+        """
+        if not gap:
+            raise ValueError("Gap character required.")
+        elif not isinstance(gap, str) or len(gap) != 1:
+            raise ValueError(f"Unexpected gap character, {gap!r}")
+        return self.replace(gap, b"")
+
+
+class UnknownSeq(Seq):
+    """Read-only sequence object of known length but unknown contents (DEPRECATED).
+
+    If you have an unknown sequence, you can represent this with a normal
+    Seq object, for example:
+
+    >>> my_seq = Seq("N"*5)
+    >>> my_seq
+    Seq('NNNNN')
+    >>> len(my_seq)
+    5
+    >>> print(my_seq)
+    NNNNN
+
+    However, this is rather wasteful of memory (especially for large
+    sequences), which is where this class is most useful:
+
+    >>> unk_five = UnknownSeq(5)
+    >>> unk_five
+    UnknownSeq(5, character='?')
+    >>> len(unk_five)
+    5
+    >>> print(unk_five)
+    ?????
+
+    You can add unknown sequences together. Provided the characters are the
+    same, you get another memory saving UnknownSeq:
+
+    >>> unk_four = UnknownSeq(4)
+    >>> unk_four
+    UnknownSeq(4, character='?')
+    >>> unk_four + unk_five
+    UnknownSeq(9, character='?')
+
+    If the characters are different, addition gives an ordinary Seq object:
+
+    >>> unk_nnnn = UnknownSeq(4, character="N")
+    >>> unk_nnnn
+    UnknownSeq(4, character='N')
+    >>> unk_nnnn + unk_four
+    Seq('NNNN????')
+
+    Combining with a real Seq gives a new Seq object:
+
+    >>> known_seq = Seq("ACGT")
+    >>> unk_four + known_seq
+    Seq('????ACGT')
+    >>> known_seq + unk_four
+    Seq('ACGT????')
+
+    Although originally intended for unknown sequences (thus the class name),
+    this can be used for homopolymer sequences like AAAAAA, and the biological
+    methods will respect this:
+
+    >>> homopolymer = UnknownSeq(6, character="A")
+    >>> homopolymer.complement()
+    UnknownSeq(6, character='T')
+    >>> homopolymer.complement_rna()
+    UnknownSeq(6, character='U')
+    >>> homopolymer.translate()
+    UnknownSeq(2, character='K')
+    """
+
+    def __init__(self, length, alphabet=None, character="?"):
+        """Create a new UnknownSeq object.
+
+        Arguments:
+         - length - Integer, required.
+         - alphabet - no longer used, must be None.
+         - character - single letter string, default "?". Typically "N"
+           for nucleotides, "X" for proteins, and "?" otherwise.
+        """
+        warnings.warn(
+            "UnknownSeq(length) is deprecated; please use Seq(None, length) instead.",
+            BiopythonDeprecationWarning,
+        )
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        self._length = int(length)
+        if self._length < 0:
+            # TODO - Block zero length UnknownSeq?  You can just use a Seq!
+            raise ValueError("Length must not be negative.")
+        if not character or len(character) != 1:
+            raise ValueError("character argument should be a single letter string.")
+        self._character = character
+
+    def __len__(self):
+        """Return the stated length of the unknown sequence."""
+        return self._length
+
+    def __bytes__(self):
+        """Return the unknown sequence as full string of the given length."""
+        return self._character.encode("ASCII") * self._length
+
+    @property
+    def _data(self):
+        return self._character.encode("ASCII") * self._length
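+    # Exposing _data as bytes generated on demand lets the methods inherited
+    # from the Seq base classes operate on UnknownSeq by materialising the
+    # repeated character; memory is only used when such a method is called.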
+
+    def __str__(self):
+        """Return the unknown sequence as full string of the given length."""
+        return self._character * self._length
+
+    def __repr__(self):
+        """Return (truncated) representation of the sequence for debugging."""
+        return f"UnknownSeq({self._length}, character={self._character!r})"
+
+    def __add__(self, other):
+        """Add another sequence or string to this sequence.
+
+        Adding two UnknownSeq objects returns another UnknownSeq object
+        provided the character is the same.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> UnknownSeq(10, character='X') + UnknownSeq(5, character='X')
+        UnknownSeq(15, character='X')
+
+        If the characters differ, an UnknownSeq object cannot be used, so a
+        Seq object is returned:
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> UnknownSeq(10, character='X') + UnknownSeq(5, character="x")
+        Seq('XXXXXXXXXXxxxxx')
+
+        If adding a string to an UnknownSeq, a new Seq is returned:
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> UnknownSeq(5, character='X') + "LV"
+        Seq('XXXXXLV')
+        """
+        if isinstance(other, UnknownSeq) and other._character == self._character:
+            return UnknownSeq(len(self) + len(other), character=self._character)
+        # Offload to the base class...
+        return Seq(bytes(self)) + other
+
+    def __radd__(self, other):
+        """Add a sequence on the left."""
+        # If other is an UnknownSeq, then __add__ would be called.
+        # Offload to the base class...
+        return other + Seq(bytes(self))
+
+    def __mul__(self, other):
+        """Multiply UnknownSeq by integer.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> UnknownSeq(3) * 2
+        UnknownSeq(6, character='?')
+        >>> UnknownSeq(3, character="N") * 2
+        UnknownSeq(6, character='N')
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(len(self) * other, character=self._character)
+
+    def __rmul__(self, other):
+        """Multiply integer by UnknownSeq.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> 2 * UnknownSeq(3)
+        UnknownSeq(6, character='?')
+        >>> 2 * UnknownSeq(3, character="N")
+        UnknownSeq(6, character='N')
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(len(self) * other, character=self._character)
+
+    def __imul__(self, other):
+        """Multiply UnknownSeq in-place.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> seq = UnknownSeq(3, character="N")
+        >>> seq *= 2
+        >>> seq
+        UnknownSeq(6, character='N')
+        """
+        if not isinstance(other, int):
+            raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+        return self.__class__(len(self) * other, character=self._character)
+
+    def __getitem__(self, index):
+        """Get a subsequence from the UnknownSeq object.
+
+        >>> unk = UnknownSeq(8, character="N")
+        >>> print(unk[:])
+        NNNNNNNN
+        >>> print(unk[5:3])
+        <BLANKLINE>
+        >>> print(unk[1:-1])
+        NNNNNN
+        >>> print(unk[1:-1:2])
+        NNN
+        """
+        if isinstance(index, int):
+            if index >= -self._length and index < self._length:
+                return self._character
+            raise IndexError("sequence index out of range")
+        start, stop, stride = index.indices(self._length)
+        length = len(range(start, stop, stride))
+        return UnknownSeq(length, character=self._character)
+
+    def count(self, sub, start=None, end=None):
+        """Return a non-overlapping count, like that of a python string.
+
+        This behaves like the python string (and Seq object) method of the
+        same name, which does a non-overlapping count!
+
+        For an overlapping search use the newer count_overlap() method.
+
+        Returns an integer, the number of occurrences of substring
+        argument sub in the (sub)sequence given by [start:end].
+        Optional arguments start and end are interpreted as in slice
+        notation.
+
+        Arguments:
+         - sub - a string or another Seq object to look for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        >>> "NNNN".count("N")
+        4
+        >>> Seq("NNNN").count("N")
+        4
+        >>> UnknownSeq(4, character="N").count("N")
+        4
+        >>> UnknownSeq(4, character="N").count("A")
+        0
+        >>> UnknownSeq(4, character="N").count("AA")
+        0
+
+        HOWEVER, please note that because python strings and Seq objects (and
+        MutableSeq objects) do a non-overlapping search, this may not give
+        the answer you expect:
+
+        >>> UnknownSeq(4, character="N").count("NN")
+        2
+        >>> UnknownSeq(4, character="N").count("NNN")
+        1
+        """
+        if isinstance(sub, _SeqAbstractBaseClass):
+            sub = str(sub)
+        elif not isinstance(sub, str):
+            raise TypeError(
+                "a Seq, MutableSeq, or string object is required, not '%s'" % type(sub)
+            )
+        # Handling case where subsequence not in self
+        if set(sub) != set(self._character):
+            return 0
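+        # Non-overlapping count: step through the region in jumps of len(sub),
+        # so each match consumes its own characters.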
+        start, stop, stride = slice(start, end, len(sub)).indices(self._length)
+        return len(range(start, stop - len(sub) + 1, stride))
+
+    def count_overlap(self, sub, start=None, end=None):
+        """Return an overlapping count.
+
+        For a non-overlapping search use the count() method.
+
+        Returns an integer, the number of occurrences of substring
+        argument sub in the (sub)sequence given by [start:end].
+        Optional arguments start and end are interpreted as in slice
+        notation.
+
+        Arguments:
+         - sub - a string or another Seq object to look for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        e.g.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> UnknownSeq(4, character="N").count_overlap("NN")
+        3
+        >>> UnknownSeq(4, character="N").count_overlap("NNN")
+        2
+
+        Where substrings do not overlap, this should behave the same as
+        the count() method:
+
+        >>> UnknownSeq(4, character="N").count_overlap("N")
+        4
+        >>> UnknownSeq(4, character="N").count_overlap("N") == UnknownSeq(4, character="N").count("N")
+        True
+        >>> UnknownSeq(4, character="N").count_overlap("A")
+        0
+        >>> UnknownSeq(4, character="N").count_overlap("A") == UnknownSeq(4, character="N").count("A")
+        True
+        >>> UnknownSeq(4, character="N").count_overlap("AA")
+        0
+        >>> UnknownSeq(4, character="N").count_overlap("AA") == UnknownSeq(4, character="N").count("AA")
+        True
+        """
+        if isinstance(sub, _SeqAbstractBaseClass):
+            sub = str(sub)
+        elif not isinstance(sub, str):
+            raise TypeError(
+                "a Seq, MutableSeq, or string object is required, not '%s'" % type(sub)
+            )
+        # Handling case where subsequence not in self
+        if set(sub) != set(self._character):
+            return 0
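+        # Overlapping count: a stride of one lets successive matches share
+        # characters, unlike the count() method above.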
+        start, stop, stride = slice(start, end).indices(self._length)
+        return len(range(start, stop - len(sub) + 1, stride))
+
+    def complement(self):
+        """Return the complement assuming it is DNA.
+
+        In typical usage this will return the same unknown sequence:
+
+        >>> my_nuc = UnknownSeq(8, character='N')
+        >>> my_nuc
+        UnknownSeq(8, character='N')
+        >>> print(my_nuc)
+        NNNNNNNN
+        >>> my_nuc.complement()
+        UnknownSeq(8, character='N')
+        >>> print(my_nuc.complement())
+        NNNNNNNN
+
+        If your sequence isn't actually unknown, and has a nucleotide letter
+        other than N, the appropriate DNA complement base is used:
+
+        >>> UnknownSeq(8, character="A").complement()
+        UnknownSeq(8, character='T')
+        """
+        s = complement(self._character)
+        return UnknownSeq(self._length, character=s)
+
+    def complement_rna(self):
+        """Return the complement assuming it is RNA.
+
+        In typical usage this will return the same unknown sequence. If your
+        sequence isn't actually unknown, the appropriate RNA complement base
+        is used:
+
+        >>> UnknownSeq(8, character="A").complement_rna()
+        UnknownSeq(8, character='U')
+        """
+        s = complement_rna(self._character)
+        return UnknownSeq(self._length, character=s)
+
+    def reverse_complement(self):
+        """Return the reverse complement assuming it is DNA.
+
+        In typical usage this will return the same unknown sequence:
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> example = UnknownSeq(6, character="N")
+        >>> print(example)
+        NNNNNN
+        >>> print(example.reverse_complement())
+        NNNNNN
+
+        If your sequence isn't actually unknown, the appropriate DNA
+        complement base is used:
+
+        >>> UnknownSeq(8, character="A").reverse_complement()
+        UnknownSeq(8, character='T')
+        """
+        return self.complement()
+
+    def reverse_complement_rna(self):
+        """Return the reverse complement assuming it is RNA.
+
+        In typical usage this will return the same unknown sequence. If your
+        sequence isn't actually unknown, the appropriate RNA complement base
+        is used:
+
+        >>> UnknownSeq(8, character="A").reverse_complement_rna()
+        UnknownSeq(8, character='U')
+        """
+        return self.complement_rna()
+
+    def transcribe(self):
+        """Return an unknown RNA sequence from an unknown DNA sequence.
+
+        >>> my_dna = UnknownSeq(10, character="N")
+        >>> my_dna
+        UnknownSeq(10, character='N')
+        >>> print(my_dna)
+        NNNNNNNNNN
+        >>> my_rna = my_dna.transcribe()
+        >>> my_rna
+        UnknownSeq(10, character='N')
+        >>> print(my_rna)
+        NNNNNNNNNN
+
+        In typical usage this will return the same unknown sequence. If your
+        sequence isn't actually unknown, but a homopolymer of T, the standard
+        DNA to RNA transcription is done, replacing T with U:
+
+        >>> UnknownSeq(9, character="t").transcribe()
+        UnknownSeq(9, character='u')
+        """
+        s = transcribe(self._character)
+        return UnknownSeq(self._length, character=s)
+
+    def back_transcribe(self):
+        """Return an unknown DNA sequence from an unknown RNA sequence.
+
+        >>> my_rna = UnknownSeq(20, character="N")
+        >>> my_rna
+        UnknownSeq(20, character='N')
+        >>> print(my_rna)
+        NNNNNNNNNNNNNNNNNNNN
+        >>> my_dna = my_rna.back_transcribe()
+        >>> my_dna
+        UnknownSeq(20, character='N')
+        >>> print(my_dna)
+        NNNNNNNNNNNNNNNNNNNN
+
+        In typical usage this will return the same unknown sequence. If your
+        sequence is actually a U homopolymer, the standard RNA to DNA
+        back-transcription applies, replacing U with T:
+
+        >>> UnknownSeq(9, character="U").back_transcribe()
+        UnknownSeq(9, character='T')
+        """
+        s = back_transcribe(self._character)
+        return UnknownSeq(self._length, character=s)
+
+    def upper(self):
+        """Return an upper case copy of the sequence.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> my_seq = UnknownSeq(20, character="n")
+        >>> my_seq
+        UnknownSeq(20, character='n')
+        >>> print(my_seq)
+        nnnnnnnnnnnnnnnnnnnn
+        >>> my_seq.upper()
+        UnknownSeq(20, character='N')
+        >>> print(my_seq.upper())
+        NNNNNNNNNNNNNNNNNNNN
+
+        See also the lower method.
+        """
+        return UnknownSeq(self._length, character=self._character.upper())
+
+    def lower(self):
+        """Return a lower case copy of the sequence.
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> my_seq = UnknownSeq(20, character="X")
+        >>> my_seq
+        UnknownSeq(20, character='X')
+        >>> print(my_seq)
+        XXXXXXXXXXXXXXXXXXXX
+        >>> my_seq.lower()
+        UnknownSeq(20, character='x')
+        >>> print(my_seq.lower())
+        xxxxxxxxxxxxxxxxxxxx
+
+        See also the upper method.
+        """
+        return UnknownSeq(self._length, character=self._character.lower())
+
+    def translate(
+        self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
+    ):
+        """Translate an unknown nucleotide sequence into an unknown protein.
+
+        If your sequence makes sense as codons (e.g. a poly-A tail AAAAAA),
+        it will be translated accordingly:
+
+        >>> UnknownSeq(7, character='A').translate()
+        UnknownSeq(2, character='K')
+
+        Otherwise, it will be translated as X for unknown amino acid:
+
+        >>> UnknownSeq(7).translate()
+        UnknownSeq(2, character='X')
+        """
+        try:
+            s = translate(
+                self._character * 3,
+                table=table,
+                stop_symbol=stop_symbol,
+                to_stop=to_stop,
+                cds=cds,
+                gap=gap,
+            )
+        except CodonTable.TranslationError:
+            # Preserve historic behaviour, ??? (default character) and XXX -> X
+            s = "X"
+        # Don't worry about to_stop - no known stop codon is three bases the same.
+        return UnknownSeq(self._length // 3, character=s)
+
+    def ungap(self, gap="-"):
+        """Return a copy of the sequence without the gap character(s).
+
+        The gap character now defaults to the minus sign, and can only
+        be specified via the method argument. This is no longer possible
+        via the sequence's alphabet (as was possible up to Biopython 1.77):
+
+        >>> from Bio.Seq import UnknownSeq
+        >>> my_dna = UnknownSeq(20, character='N')
+        >>> my_dna
+        UnknownSeq(20, character='N')
+        >>> my_dna.ungap()  # using default
+        UnknownSeq(20, character='N')
+        >>> my_dna.ungap("-")
+        UnknownSeq(20, character='N')
+
+        If the UnknownSeq is using the gap character, then an empty Seq is
+        returned:
+
+        >>> my_gap = UnknownSeq(20, character="-")
+        >>> my_gap
+        UnknownSeq(20, character='-')
+        >>> my_gap.ungap()  # using default
+        Seq('')
+        >>> my_gap.ungap("-")
+        Seq('')
+        """
+        if self._character == gap:
+            return Seq("")
+        else:
+            return UnknownSeq(self._length, character=self._character)
+
+    def join(self, other):
+        """Return a merge of the sequences in other, spaced by the sequence from self.
+
+        Accepts either a Seq or string (and iterates over the letters), or an
+        iterable containing Seq or string objects. These arguments will be
+        concatenated with the calling sequence as the spacer:
+
+        >>> concatenated = UnknownSeq(5).join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
+        >>> concatenated
+        Seq('AAA?????TTT?????PPP')
+
+        If all the inputs are also UnknownSeq using the same character, then it
+        returns a new UnknownSeq:
+
+        >>> UnknownSeq(5).join([UnknownSeq(3), UnknownSeq(3), UnknownSeq(3)])
+        UnknownSeq(19, character='?')
+
+        Examples taking a single sequence and joining the letters:
+
+        >>> UnknownSeq(3).join("ACGT")
+        Seq('A???C???G???T')
+        >>> UnknownSeq(3).join(UnknownSeq(4))
+        UnknownSeq(13, character='?')
+
+        Will only return an UnknownSeq object if all of the objects to be joined are
+        also UnknownSeqs with the same character as the spacer, similar to how the
+        addition of an UnknownSeq and another UnknownSeq would work.
+        """
+        from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports
+
+        if isinstance(other, (str, _SeqAbstractBaseClass)):
+            if isinstance(other, UnknownSeq) and self._character == other._character:
+                # Special case, can return an UnknownSeq
+                return self.__class__(
+                    len(other) + len(self) * (len(other) - 1), character=self._character
+                )
+            return Seq(str(self).join(str(other)))
+        if isinstance(other, SeqRecord):
+            raise TypeError("Iterable cannot be a SeqRecord")
+
+        for c in other:
+            if isinstance(c, SeqRecord):
+                raise TypeError("Iterable cannot contain SeqRecords")
+            elif not isinstance(c, (str, _SeqAbstractBaseClass)):
+                raise TypeError("Input must be an iterable of Seqs or Strings")
+        temp_data = str(self).join([str(_) for _ in other])
+        if temp_data.count(self._character) == len(temp_data):
+            # Can return an UnknownSeq
+            return self.__class__(len(temp_data), character=self._character)
+        return Seq(temp_data)
+
+
+class MutableSeq(_SeqAbstractBaseClass):
+    """An editable sequence object.
+
+    Unlike normal python strings and our basic sequence object (the Seq class)
+    which are immutable, the MutableSeq lets you edit the sequence in place.
+    However, this means you cannot use a MutableSeq object as a dictionary key.
+
+    >>> from Bio.Seq import MutableSeq
+    >>> my_seq = MutableSeq("ACTCGTCGTCG")
+    >>> my_seq
+    MutableSeq('ACTCGTCGTCG')
+    >>> my_seq[5]
+    'T'
+    >>> my_seq[5] = "A"
+    >>> my_seq
+    MutableSeq('ACTCGACGTCG')
+    >>> my_seq[5]
+    'A'
+    >>> my_seq[5:8] = "NNN"
+    >>> my_seq
+    MutableSeq('ACTCGNNNTCG')
+    >>> len(my_seq)
+    11
+
+    Note that the MutableSeq object does not support as many string-like
+    or biological methods as the Seq object.
+    """
+
+    def __init__(self, data):
+        """Create a MutableSeq object."""
+        if isinstance(data, array.array):
+            if data.typecode != "u":
+                raise ValueError(
+                    "data should be a string, array of characters, Seq object, "
+                    "or MutableSeq object"
+                )
+            warnings.warn(
+                "Initializing a MutableSeq by an array has been deprecated; please "
+                "use a bytearray object instead.",
+                BiopythonDeprecationWarning,
+            )
+            data = data.tounicode()
+        if isinstance(data, bytearray):
+            self._data = data
+        elif isinstance(data, bytes):
+            self._data = bytearray(data)
+        elif isinstance(data, str):
+            self._data = bytearray(data, "ASCII")
+        elif isinstance(data, MutableSeq):
+            self._data = data._data[:]  # Take a copy
+        elif isinstance(data, Seq):
+            # Make no assumptions about the Seq subclass internal storage
+            self._data = bytearray(bytes(data))
+        else:
+            raise TypeError(
+                "data should be a string, bytearray object, Seq object, or a "
+                "MutableSeq object"
+            )
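+
+    # Illustrative constructor inputs (a sketch mirroring the isinstance
+    # checks above):
+    #     MutableSeq("ACGT"); MutableSeq(bytearray(b"ACGT"))
+    #     MutableSeq(Seq("ACGT")); MutableSeq(MutableSeq("ACGT"))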
+
+    @property
+    def data(self):
+        """Get the data."""
+        warnings.warn(
+            "Accessing MutableSeq.data has been deprecated, as it is now a private "
+            "attribute. Please use indexing to access the sequence contents of "
+            "a MutableSeq object.",
+            BiopythonDeprecationWarning,
+        )
+        return array.array("u", self._data.decode("ASCII"))
+
+    @data.setter
+    def data(self, value):
+        """Set the data."""
+        warnings.warn(
+            "Accessing MutableSeq.data has been deprecated, as it is now a private "
+            "attribute. Please use indexing to access the sequence contents of "
+            "a MutableSeq object.",
+            BiopythonDeprecationWarning,
+        )
+        self.__init__(value)
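+
+    # Preferred modern access (a sketch): index, slice, or convert directly,
+    # e.g. my_seq[0], my_seq[2:5], or str(my_seq), rather than using the
+    # deprecated .data array.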
+
+    def __setitem__(self, index, value):
+        """Set a subsequence of single letter via value parameter.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq[0] = 'T'
+        >>> my_seq
+        MutableSeq('TCTCGACGTCG')
+        """
+        if isinstance(index, int):
+            # Replacing a single letter with a new string
+            self._data[index] = ord(value)
+        else:
+            # Replacing a sub-sequence
+            if isinstance(value, MutableSeq):
+                self._data[index] = value._data
+            elif isinstance(value, Seq):
+                self._data[index] = bytes(value)
+            elif isinstance(value, str):
+                self._data[index] = value.encode("ASCII")
+            else:
+                raise TypeError("received unexpected type %s" % type(value))
+
+    def __delitem__(self, index):
+        """Delete a subsequence of single letter.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> del my_seq[0]
+        >>> my_seq
+        MutableSeq('CTCGACGTCG')
+        """
+        # Could be deleting a single letter, or a slice
+        del self._data[index]
+
+    def append(self, c):
+        """Append a single letter to the mutable sequence object.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq.append('A')
+        >>> my_seq
+        MutableSeq('ACTCGACGTCGA')
+
+        No return value.
+        """
+        self._data.append(ord(c.encode("ASCII")))
+
+    def insert(self, i, c):
+        """Insert a single letter into the mutable sequence at a given index.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq.insert(0,'A')
+        >>> my_seq
+        MutableSeq('AACTCGACGTCG')
+        >>> my_seq.insert(8,'G')
+        >>> my_seq
+        MutableSeq('AACTCGACGGTCG')
+
+        No return value.
+        """
+        self._data.insert(i, ord(c.encode("ASCII")))
+
+    def pop(self, i=-1):
+        """Remove and return a single letter at the given index.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq.pop()
+        'G'
+        >>> my_seq
+        MutableSeq('ACTCGACGTC')
+        >>> my_seq.pop()
+        'C'
+        >>> my_seq
+        MutableSeq('ACTCGACGT')
+
+        Returns the removed character (the last one by default).
+        """
+        c = self._data[i]
+        del self._data[i]
+        return chr(c)
+
+    def remove(self, item):
+        """Remove the first occurrence of a single letter from the mutable sequence.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq.remove('C')
+        >>> my_seq
+        MutableSeq('ATCGACGTCG')
+        >>> my_seq.remove('A')
+        >>> my_seq
+        MutableSeq('TCGACGTCG')
+
+        No return value.
+        """
+        codepoint = ord(item)
+        try:
+            self._data.remove(codepoint)
+        except ValueError:
+            raise ValueError("value not found in MutableSeq") from None
+
+    def reverse(self):
+        """Modify the mutable sequence to reverse itself.
+
+        No return value.
+        """
+        self._data.reverse()
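+
+    # Illustrative usage (a sketch; reverse() mutates and returns None):
+    #     >>> m = MutableSeq("ACGT")
+    #     >>> m.reverse()
+    #     >>> m
+    #     MutableSeq('TGCA')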
+
+    def complement(self):
+        """Modify the mutable sequence to take on its complement.
+
+        No return value.
+
+        If the sequence contains neither T nor U, DNA is assumed
+        and any A will be mapped to T.
+
+        If the sequence contains both T and U, an exception is raised.
+        """
+        if ord("U") in self._data and ord("T") in self._data:
+            raise ValueError("Mixed RNA/DNA found")
+        elif ord("U") in self._data:
+            table = _rna_complement_table
+        else:
+            table = _dna_complement_table
+        self._data = self._data.translate(table)
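+
+    # Illustrative usage (a sketch; complement() works in place):
+    #     >>> m = MutableSeq("ATGC")
+    #     >>> m.complement()
+    #     >>> m
+    #     MutableSeq('TACG')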
+
+    def reverse_complement(self):
+        """Modify the mutable sequence to take on its reverse complement.
+
+        No return value.
+        """
+        self.complement()
+        self._data.reverse()
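+
+    # Illustrative usage (a sketch; the complement is taken, then reversed):
+    #     >>> m = MutableSeq("ATGC")
+    #     >>> m.reverse_complement()
+    #     >>> m
+    #     MutableSeq('GCAT')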
+
+    def extend(self, other):
+        """Add a sequence to the original mutable sequence object.
+
+        >>> my_seq = MutableSeq('ACTCGACGTCG')
+        >>> my_seq.extend('A')
+        >>> my_seq
+        MutableSeq('ACTCGACGTCGA')
+        >>> my_seq.extend('TTT')
+        >>> my_seq
+        MutableSeq('ACTCGACGTCGATTT')
+
+        No return value.
+        """
+        if isinstance(other, MutableSeq):
+            self._data.extend(other._data)
+        elif isinstance(other, Seq):
+            self._data.extend(bytes(other))
+        elif isinstance(other, str):
+            self._data.extend(other.encode("ASCII"))
+        else:
+            raise TypeError("expected a string, Seq or MutableSeq")
+
+    def toseq(self):
+        """Return the full sequence as a new immutable Seq object.
+
+        >>> from Bio.Seq import MutableSeq
+        >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL")
+        >>> my_mseq
+        MutableSeq('MKQHKAMIVALIVICITAVVAAL')
+        >>> my_mseq.toseq()
+        Seq('MKQHKAMIVALIVICITAVVAAL')
+        """
+        warnings.warn(
+            "myseq.toseq() is deprecated; please use Seq(myseq) instead.",
+            BiopythonDeprecationWarning,
+        )
+        return Seq(self)
+
+
+class UndefinedSequenceError(ValueError):
+    """Sequence content is undefined."""
+
+
+class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
+    """Stores the length of a sequence with undefined sequence contents (PRIVATE).
+
+    Objects of this class can be used to create a Seq object to represent
+    sequences with a known length, but an unknown sequence contents.
+    Calling __len__ returns the sequence length. Indexing with an integer
+    raises an UndefinedSequenceError (a ValueError subclass), while slicing
+    returns a new _UndefinedSequenceData of the requested size, except for
+    zero-size requests, which return an empty bytes object.
+    """
+
+    __slots__ = ("_length",)
+
+    def __init__(self, length):
+        """Initialize the object with the sequence length."""
+        if length < 0:
+            raise ValueError("Length must not be negative.")
+        self._length = length
+        super().__init__()
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            start, end, step = key.indices(self._length)
+            size = len(range(start, end, step))
+            if size == 0:
+                return b""
+            return _UndefinedSequenceData(size)
+        else:
+            raise UndefinedSequenceError("Sequence content is undefined")
+
+    def __len__(self):
+        return self._length
+
+    def __bytes__(self):
+        if self._length == 0:
+            return b""
+        raise UndefinedSequenceError("Sequence content is undefined")
+
+    def __add__(self, other):
+        if isinstance(other, _UndefinedSequenceData):
+            return _UndefinedSequenceData(self._length + other._length)
+        raise TypeError
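+
+# Illustrative behaviour (a sketch, assuming the slicing and addition
+# support defined above):
+#     >>> undefined = Seq(None, length=20)
+#     >>> undefined[5:10]
+#     Seq(None, length=5)
+#     >>> len(undefined + Seq(None, length=5))
+#     25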
+
+
+# The transcribe, back_transcribe, and translate functions are
+# user-friendly versions of the corresponding Seq/MutableSeq methods.
+# The functions work both on Seq objects, and on strings.
+
+
+def transcribe(dna):
+    """Transcribe a DNA sequence into RNA.
+
+    If given a string, returns a new string object.
+
+    Given a Seq or MutableSeq, returns a new Seq object.
+
+    e.g.
+
+    >>> transcribe("ACTGN")
+    'ACUGN'
+    """
+    if isinstance(dna, Seq):
+        return dna.transcribe()
+    elif isinstance(dna, MutableSeq):
+        return Seq(dna).transcribe()
+    else:
+        return dna.replace("T", "U").replace("t", "u")
+
+
+def back_transcribe(rna):
+    """Return the RNA sequence back-transcribed into DNA.
+
+    If given a string, returns a new string object.
+
+    Given a Seq or MutableSeq, returns a new Seq object.
+
+    e.g.
+
+    >>> back_transcribe("ACUGN")
+    'ACTGN'
+    """
+    if isinstance(rna, Seq):
+        return rna.back_transcribe()
+    elif isinstance(rna, MutableSeq):
+        return Seq(rna).back_transcribe()
+    else:
+        return rna.replace("U", "T").replace("u", "t")
+
+
+def _translate_str(
+    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
+):
+    """Translate nucleotide string into a protein string (PRIVATE).
+
+    Arguments:
+     - sequence - a string
+     - table - Which codon table to use?  This can be either a name (string),
+       an NCBI identifier (integer), or a CodonTable object (useful for
+       non-standard genetic codes).  This defaults to the "Standard" table.
+     - stop_symbol - a single character string, what to use for terminators.
+     - to_stop - boolean, should translation terminate at the first
+       in frame stop codon?  If there is no in-frame stop codon
+       then translation continues to the end.
+     - pos_stop - a single character string for a possible stop codon
+       (e.g. TAN or NNN)
+     - cds - Boolean, indicates this is a complete CDS.  If True, this
+       checks the sequence starts with a valid alternative start
+       codon (which will be translated as methionine, M), that the
+       sequence length is a multiple of three, and that there is a
+       single in frame stop codon at the end (this will be excluded
+       from the protein sequence, regardless of the to_stop option).
+       If these tests fail, an exception is raised.
+     - gap - Single character string to denote symbol used for gaps.
+       Defaults to None.
+
+    Returns a string.
+
+    e.g.
+
+    >>> from Bio.Data import CodonTable
+    >>> table = CodonTable.ambiguous_dna_by_id[1]
+    >>> _translate_str("AAA", table)
+    'K'
+    >>> _translate_str("TAR", table)
+    '*'
+    >>> _translate_str("TAN", table)
+    'X'
+    >>> _translate_str("TAN", table, pos_stop="@")
+    '@'
+    >>> _translate_str("TA?", table)
+    Traceback (most recent call last):
+       ...
+    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid
+
+    In a change to older versions of Biopython, partial codons are now
+    always regarded as an error (previously only checked if cds=True)
+    and will trigger a warning (likely to become an exception in a
+    future release).
+
+    If **cds=True**, the start and stop codons are checked, and the start
+    codon will be translated as methionine. The sequence must be a whole
+    number of codons.
+
+    >>> _translate_str("ATGCCCTAG", table, cds=True)
+    'MP'
+    >>> _translate_str("AAACCCTAG", table, cds=True)
+    Traceback (most recent call last):
+       ...
+    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
+    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
+    Traceback (most recent call last):
+       ...
+    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon found.
+    """
+    try:
+        table_id = int(table)
+    except ValueError:
+        # Assume it's a table name
+        # The same table can be used for RNA or DNA
+        codon_table = CodonTable.ambiguous_generic_by_name[table]
+    except (AttributeError, TypeError):
+        # Assume it's a CodonTable object
+        if isinstance(table, CodonTable.CodonTable):
+            codon_table = table
+        else:
+            raise ValueError("Bad table argument") from None
+    else:
+        # Assume it's a table ID
+        # The same table can be used for RNA or DNA
+        codon_table = CodonTable.ambiguous_generic_by_id[table_id]
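+    # Whichever branch ran, codon_table is now a CodonTable object, whether
+    # the caller passed a table name ("Standard"), an NCBI id (1), or a
+    # CodonTable instance.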
+    sequence = sequence.upper()
+    amino_acids = []
+    forward_table = codon_table.forward_table
+    stop_codons = codon_table.stop_codons
+    if codon_table.nucleotide_alphabet is not None:
+        valid_letters = set(codon_table.nucleotide_alphabet.upper())
+    else:
+        # Assume the worst case, ambiguous DNA or RNA:
+        valid_letters = set(
+            IUPACData.ambiguous_dna_letters.upper()
+            + IUPACData.ambiguous_rna_letters.upper()
+        )
+    n = len(sequence)
+
+    # Check for tables with 'ambiguous' (dual-coding) stop codons:
+    dual_coding = [c for c in stop_codons if c in forward_table]
+    if dual_coding:
+        c = dual_coding[0]
+        if to_stop:
+            raise ValueError(
+                "You cannot use 'to_stop=True' with this table as it contains"
+                f" {len(dual_coding)} codon(s) which can be both STOP and an"
+                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
+            )
+        warnings.warn(
+            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
+            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
+            " or STOP). Such codons will be translated as amino acid.",
+            BiopythonWarning,
+        )
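+        # (NCBI table 27 is one such table: TGA can read as STOP or as Trp;
+        # see the translate() docstring below.)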
+
+    if cds:
+        if str(sequence[:3]).upper() not in codon_table.start_codons:
+            raise CodonTable.TranslationError(
+                f"First codon '{sequence[:3]}' is not a start codon"
+            )
+        if n % 3 != 0:
+            raise CodonTable.TranslationError(
+                f"Sequence length {n} is not a multiple of three"
+            )
+        if str(sequence[-3:]).upper() not in stop_codons:
+            raise CodonTable.TranslationError(
+                f"Final codon '{sequence[-3:]}' is not a stop codon"
+            )
+        # Don't translate the stop symbol, and manually translate the M
+        sequence = sequence[3:-3]
+        n -= 6
+        amino_acids = ["M"]
+    elif n % 3 != 0:
+        warnings.warn(
+            "Partial codon, len(sequence) not a multiple of three. "
+            "Explicitly trim the sequence or add trailing N before "
+            "translation. This may become an error in future.",
+            BiopythonWarning,
+        )
+    if gap is not None:
+        if not isinstance(gap, str):
+            raise TypeError("Gap character should be a single character string.")
+        elif len(gap) > 1:
+            raise ValueError("Gap character should be a single character string.")
+
+    for i in range(0, n - n % 3, 3):
+        codon = sequence[i : i + 3]
+        try:
+            amino_acids.append(forward_table[codon])
+        except (KeyError, CodonTable.TranslationError):
+            if codon in codon_table.stop_codons:
+                if cds:
+                    raise CodonTable.TranslationError(
+                        "Extra in frame stop codon found."
+                    ) from None
+                if to_stop:
+                    break
+                amino_acids.append(stop_symbol)
+            elif valid_letters.issuperset(set(codon)):
+                # Possible stop codon (e.g. NNN or TAN)
+                amino_acids.append(pos_stop)
+            elif gap is not None and codon == gap * 3:
+                # Gapped translation
+                amino_acids.append(gap)
+            else:
+                raise CodonTable.TranslationError(
+                    f"Codon '{codon}' is invalid"
+                ) from None
+    return "".join(amino_acids)
+
+
+def translate(
+    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
+):
+    """Translate a nucleotide sequence into amino acids.
+
+    If given a string, returns a new string object. Given a Seq or
+    MutableSeq, returns a Seq object.
+
+    Arguments:
+     - table - Which codon table to use?  This can be either a name
+       (string), an NCBI identifier (integer), or a CodonTable object
+       (useful for non-standard genetic codes).  Defaults to the "Standard"
+       table.
+     - stop_symbol - Single character string, what to use for any
+       terminators, defaults to the asterisk, "*".
+     - to_stop - Boolean, defaults to False meaning do a full
+       translation continuing on past any stop codons
+       (translated as the specified stop_symbol).  If
+       True, translation is terminated at the first in
+       frame stop codon (and the stop_symbol is not
+       appended to the returned protein sequence).
+     - cds - Boolean, indicates this is a complete CDS.  If True, this
+       checks the sequence starts with a valid alternative start
+       codon (which will be translated as methionine, M), that the
+       sequence length is a multiple of three, and that there is a
+       single in frame stop codon at the end (this will be excluded
+       from the protein sequence, regardless of the to_stop option).
+       If these tests fail, an exception is raised.
+     - gap - Single character string to denote symbol used for gaps.
+       Defaults to None.
+
+    A simple string example using the default (standard) genetic code:
+
+    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
+    >>> translate(coding_dna)
+    'VAIVMGR*KGAR*'
+    >>> translate(coding_dna, stop_symbol="@")
+    'VAIVMGR@KGAR@'
+    >>> translate(coding_dna, to_stop=True)
+    'VAIVMGR'
+
+    Now using NCBI table 2, where TGA is not a stop codon:
+
+    >>> translate(coding_dna, table=2)
+    'VAIVMGRWKGAR*'
+    >>> translate(coding_dna, table=2, to_stop=True)
+    'VAIVMGRWKGAR'
+
+    In fact this example uses an alternative start codon valid under NCBI
+    table 2, GTG, which means this example is a complete valid CDS which
+    when translated should really start with methionine (not valine):
+
+    >>> translate(coding_dna, table=2, cds=True)
+    'MAIVMGRWKGAR'
+
+    Note that if the sequence has no in-frame stop codon, then the to_stop
+    argument has no effect:
+
+    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
+    >>> translate(coding_dna2)
+    'VAIVMGR'
+    >>> translate(coding_dna2, to_stop=True)
+    'VAIVMGR'
+
+    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+    or a stop codon.  These are translated as "X".  Any invalid codon
+    (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+    It will however translate either DNA or RNA.
+
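+    If a gap character is declared, a codon consisting entirely of that
+    character is passed through as a single gap in the protein (an
+    illustrative example):
+
+    >>> translate("GTG---GCCATT", gap="-")
+    'V-AI'
+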
+    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
+    stop codons'. These are stop codons with unambiguous sequence but which
+    have a context dependent coding as STOP or as amino acid. With these tables
+    'to_stop' must be False (otherwise a ValueError is raised). The dual
+    coding codons will always be translated as amino acid, except for
+    'cds=True', where the last codon will be translated as STOP.
+
+    >>> coding_dna3 = "ATGGCACGGAAGTGA"
+    >>> translate(coding_dna3)
+    'MARK*'
+
+    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
+    'MARKW'
+
+    It will however raise a BiopythonWarning (not shown).
+
+    >>> translate(coding_dna3, table=27, cds=True)
+    'MARK'
+
+    >>> translate(coding_dna3, table=27, to_stop=True)
+    Traceback (most recent call last):
+       ...
+    ValueError: You cannot use 'to_stop=True' with this table ...
+    """
+    if isinstance(sequence, Seq):
+        return sequence.translate(table, stop_symbol, to_stop, cds)
+    elif isinstance(sequence, MutableSeq):
+        # Return a Seq object
+        return Seq(sequence).translate(table, stop_symbol, to_stop, cds)
+    else:
+        # Assume it's a string, return a string
+        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
+
+
+def reverse_complement(sequence):
+    """Return the reverse complement sequence of a nucleotide string.
+
+    If given a string, returns a new string object.
+    Given a Seq or a MutableSeq, returns a new Seq object.
+
+    Supports unambiguous and ambiguous nucleotide sequences.
+
+    e.g.
+
+    >>> reverse_complement("ACTG-NH")
+    'DN-CAGT'
+
+    If neither T nor U is present, DNA is assumed and A is mapped to T:
+
+    >>> reverse_complement("A")
+    'T'
+    """
+    return complement(sequence)[::-1]
+
+
+def complement(sequence):
+    """Return the complement sequence of a DNA string.
+
+    If given a string, returns a new string object.
+
+    Given a Seq or a MutableSeq, returns a new Seq object.
+
+    Supports unambiguous and ambiguous nucleotide sequences.
+
+    e.g.
+
+    >>> complement("ACTG-NH")
+    'TGAC-ND'
+
+    If neither T nor U is present, DNA is assumed and A is mapped to T:
+
+    >>> complement("A")
+    'T'
+
+    However, this may not be supported in future. Please use the
+    complement_rna function if you have RNA.
+    """
+    if isinstance(sequence, Seq):
+        # Return a Seq
+        return sequence.complement()
+    elif isinstance(sequence, MutableSeq):
+        # Return a Seq
+        # Don't use the MutableSeq reverse_complement method as it is
+        # 'in place'.
+        return Seq(sequence).complement()
+
+    # Assume it's a string.
+    # In order to avoid some code duplication, the old code would turn the
+    # string into a Seq, use the reverse_complement method, and convert back
+    # to a string.
+    # This worked, but is over five times slower on short sequences!
+    sequence = sequence.encode("ASCII")
+    if (b"U" in sequence or b"u" in sequence) and (
+        b"T" in sequence or b"t" in sequence
+    ):  # ugly but this is what black wants
+        raise ValueError("Mixed RNA/DNA found")
+    elif b"U" in sequence or b"u" in sequence:
+        # TODO - warning or exception in future?
+        ttable = _rna_complement_table
+    else:
+        ttable = _dna_complement_table
+    sequence = sequence.translate(ttable)
+    return sequence.decode("ASCII")
+
+
+def complement_rna(sequence):
+    """Return the complement sequence of an RNA string.
+
+    >>> complement("ACG")  # assumed DNA
+    'TGC'
+    >>> complement_rna("ACG")
+    'UGC'
+
+    Any T in the sequence is treated as a U.
+    """
+    if isinstance(sequence, Seq):
+        # Return a Seq
+        return sequence.complement_rna()
+    elif isinstance(sequence, MutableSeq):
+        # Return a Seq
+        return Seq(sequence).complement_rna()
+    sequence = sequence.encode("ASCII")
+    sequence = sequence.translate(_rna_complement_table)
+    return sequence.decode("ASCII")
+
+
+def _test():
+    """Run the Bio.Seq module's doctests (PRIVATE)."""
+    print("Running doctests...")
+    import doctest
+
+    doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
+    print("Done")
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/code/lib/Bio/SeqFeature.py b/code/lib/Bio/SeqFeature.py
new file mode 100644
index 0000000..625154a
--- /dev/null
+++ b/code/lib/Bio/SeqFeature.py
@@ -0,0 +1,2224 @@
+# Copyright 2000-2003 Jeff Chang.
+# Copyright 2001-2008 Brad Chapman.
+# Copyright 2005-2016 by Peter Cock.
+# Copyright 2006-2009 Michiel de Hoon.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Represent a Sequence Feature holding info about a part of a sequence.
+
+This is heavily modeled after the Biocorba SeqFeature objects, and
+may be pretty biased towards GenBank stuff since I'm writing it
+for the GenBank parser output...
+
+What's here:
+
+Base class to hold a Feature
+----------------------------
+
+Classes:
+ - SeqFeature
+
+Hold information about a Reference
+----------------------------------
+
+This is an attempt to create a General class to hold Reference type
+information.
+
+Classes:
+ - Reference
+
+Specify locations of a feature on a Sequence
+--------------------------------------------
+
+This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
+This has the advantages of allowing us to handle fuzzy stuff in case anyone
+needs it, and also be compatible with BioPerl etc and BioSQL.
+
+Classes:
+ - FeatureLocation - Specify the start and end location of a feature.
+ - CompoundLocation - Collection of FeatureLocation objects (for joins etc).
+ - ExactPosition - Specify the position as being exact.
+ - WithinPosition - Specify a position occurring within some range.
+ - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
+ - BeforePosition - Specify the position as being found before some base.
+ - AfterPosition - Specify the position as being found after some base.
+ - OneOfPosition - Specify a position where the location can be multiple positions.
+ - UncertainPosition - Specify a specific position which is uncertain.
+ - UnknownPosition - Represents missing information like '?' in UniProt.
+
+"""
+import functools
+
+from collections import OrderedDict
+
+from Bio.Seq import MutableSeq
+from Bio.Seq import reverse_complement
+from Bio.Seq import Seq
+
+
+class SeqFeature:
+    """Represent a Sequence Feature on an object.
+
+    Attributes:
+     - location - the location of the feature on the sequence (FeatureLocation)
+     - type - the specified type of the feature (ie. CDS, exon, repeat...)
+     - location_operator - a string specifying how this SeqFeature may
+       be related to others. For example, in the example GenBank feature
+       shown below, the location_operator would be "join". This is a proxy
+       for feature.location.operator and only applies to compound locations.
+     - strand - A value specifying on which strand (of a DNA sequence, for
+       instance) the feature deals with. 1 indicates the plus strand, -1
+       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
+       while the default of None indicates that strand doesn't apply (dot in GFF3,
+       e.g. features on proteins). Note this is a shortcut for accessing the
+       strand property of the feature's location.
+     - id - A string identifier for the feature.
+     - ref - A reference to another sequence. This could be an accession
+       number for some different sequence. Note this is a shortcut for the
+       reference property of the feature's location.
+     - ref_db - A different database for the reference accession number.
+       Note this is a shortcut for the reference property of the location
+     - qualifiers - A dictionary of qualifiers on the feature. These are
+       analogous to the qualifiers from a GenBank feature table. The keys of
+       the dictionary are qualifier names, the values are the qualifier
+       values. As of Biopython 1.69 this is an ordered dictionary.
+
+    """
+
+    def __init__(
+        self,
+        location=None,
+        type="",
+        location_operator="",
+        strand=None,
+        id="",
+        qualifiers=None,
+        sub_features=None,
+        ref=None,
+        ref_db=None,
+    ):
+        """Initialize a SeqFeature on a Sequence.
+
+        location can either be a FeatureLocation (with strand argument also
+        given if required), or None.
+
+        e.g. With no strand, on the forward strand, and on the reverse strand:
+
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
+        >>> f1.strand == f1.location.strand == None
+        True
+        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
+        >>> f2.strand == f2.location.strand == +1
+        True
+        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
+        >>> f3.strand == f3.location.strand == -1
+        True
+
+        An invalid strand will trigger an exception:
+
+        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
+        Traceback (most recent call last):
+           ...
+        ValueError: Strand should be +1, -1, 0 or None, not 2
+
+        Similarly if set via the FeatureLocation directly:
+
+        >>> loc4 = FeatureLocation(50, 60, strand=2)
+        Traceback (most recent call last):
+           ...
+        ValueError: Strand should be +1, -1, 0 or None, not 2
+
+        For exact start/end positions, an integer can be used (as shown above)
+        as shorthand for the ExactPosition object. For non-exact locations, the
+        FeatureLocation must be specified via the appropriate position objects.
+
+        Note that the strand, ref and ref_db arguments to the SeqFeature are
+        now obsolete and will be deprecated in a future release (which will
+        give warning messages) and later removed. Set them via the location
+        object instead.
+
+        Note that location_operator and sub_features arguments can no longer
+        be used, instead do this via the CompoundLocation object.
+        """
+        if (
+            location is not None
+            and not isinstance(location, FeatureLocation)
+            and not isinstance(location, CompoundLocation)
+        ):
+            raise TypeError(
+                "FeatureLocation, CompoundLocation (or None) required for the location"
+            )
+        self.location = location
+        self.type = type
+        if location_operator:
+            # TODO - Deprecation warning
+            self.location_operator = location_operator
+        if strand is not None:
+            # TODO - Deprecation warning
+            self.strand = strand
+        self.id = id
+        if qualifiers is None:
+            qualifiers = OrderedDict()
+        self.qualifiers = qualifiers
+        if sub_features is not None:
+            raise TypeError("Rather than sub_features, use a CompoundFeatureLocation")
+        if ref is not None:
+            # TODO - Deprecation warning
+            self.ref = ref
+        if ref_db is not None:
+            # TODO - Deprecation warning
+            self.ref_db = ref_db
+
+    def _get_strand(self):
+        """Get function for the strand property (PRIVATE)."""
+        return self.location.strand
+
+    def _set_strand(self, value):
+        """Set function for the strand property (PRIVATE)."""
+        try:
+            self.location.strand = value
+        except AttributeError:
+            if self.location is None:
+                if value is not None:
+                    raise ValueError("Can't set strand without a location.") from None
+            else:
+                raise
+
+    strand = property(
+        fget=_get_strand,
+        fset=_set_strand,
+        doc="""Feature's strand
+
+                          This is a shortcut for feature.location.strand
+                          """,
+    )
+
+    def _get_ref(self):
+        """Get function for the reference property (PRIVATE)."""
+        try:
+            return self.location.ref
+        except AttributeError:
+            return None
+
+    def _set_ref(self, value):
+        """Set function for the reference property (PRIVATE)."""
+        try:
+            self.location.ref = value
+        except AttributeError:
+            if self.location is None:
+                if value is not None:
+                    raise ValueError("Can't set ref without a location.") from None
+            else:
+                raise
+
+    ref = property(
+        fget=_get_ref,
+        fset=_set_ref,
+        doc="""Feature location reference (e.g. accession).
+
+                       This is a shortcut for feature.location.ref
+                       """,
+    )
+
+    def _get_ref_db(self):
+        """Get function for the database reference property (PRIVATE)."""
+        try:
+            return self.location.ref_db
+        except AttributeError:
+            return None
+
+    def _set_ref_db(self, value):
+        """Set function for the database reference property (PRIVATE)."""
+        self.location.ref_db = value
+
+    ref_db = property(
+        fget=_get_ref_db,
+        fset=_set_ref_db,
+        doc="""Feature location reference's database.
+
+                          This is a shortcut for feature.location.ref_db
+                          """,
+    )
+
+    def _get_location_operator(self):
+        """Get function for the location operator property (PRIVATE)."""
+        try:
+            return self.location.operator
+        except AttributeError:
+            return None
+
+    def _set_location_operator(self, value):
+        """Set function for the location operator property (PRIVATE)."""
+        if value:
+            if isinstance(self.location, CompoundLocation):
+                self.location.operator = value
+            elif self.location is None:
+                raise ValueError(
+                    "Location is None so can't set its operator (to %r)" % value
+                )
+            else:
+                raise ValueError("Only CompoundLocation gets an operator (%r)" % value)
+
+    location_operator = property(
+        fget=_get_location_operator,
+        fset=_set_location_operator,
+        doc="Location operator for compound locations (e.g. join).",
+    )
+
+    def __repr__(self):
+        """Represent the feature as a string for debugging."""
+        answer = "%s(%r" % (self.__class__.__name__, self.location)
+        if self.type:
+            answer += ", type=%r" % self.type
+        if self.location_operator:
+            answer += ", location_operator=%r" % self.location_operator
+        if self.id and self.id != "":
+            answer += ", id=%r" % self.id
+        if self.ref:
+            answer += ", ref=%r" % self.ref
+        if self.ref_db:
+            answer += ", ref_db=%r" % self.ref_db
+        answer += ")"
+        return answer
+
+    def __str__(self):
+        """Return the full feature as a python string."""
+        out = "type: %s\n" % self.type
+        out += "location: %s\n" % self.location
+        if self.id and self.id != "":
+            out += "id: %s\n" % self.id
+        out += "qualifiers:\n"
+        for qual_key in sorted(self.qualifiers):
+            out += "    Key: %s, Value: %s\n" % (qual_key, self.qualifiers[qual_key])
+        return out
+
+    def _shift(self, offset):
+        """Return a copy of the feature with its location shifted (PRIVATE).
+
+        The annotation qualifiers are copied.
+        """
+        return SeqFeature(
+            location=self.location._shift(offset),
+            type=self.type,
+            location_operator=self.location_operator,
+            id=self.id,
+            qualifiers=OrderedDict(self.qualifiers.items()),
+        )
+
+    def _flip(self, length):
+        """Return a copy of the feature with its location flipped (PRIVATE).
+
+        The argument length gives the length of the parent sequence. For
+        example a location 0..20 (+1 strand) with parent length 30 becomes
+        after flipping 10..30 (-1 strand). Strandless (None) or unknown
+        strand (0) remain like that - just their end points are changed.
+
+        The annotation qualifiers are copied.
+        """
+        return SeqFeature(
+            location=self.location._flip(length),
+            type=self.type,
+            location_operator=self.location_operator,
+            id=self.id,
+            qualifiers=OrderedDict(self.qualifiers.items()),
+        )
+
+    def extract(self, parent_sequence, references=None):
+        """Extract the feature's sequence from supplied parent sequence.
+
+        The parent_sequence can be a Seq like object or a string, and will
+        generally return an object of the same type. The exception is a
+        MutableSeq parent sequence, which returns a Seq object.
+
+        This should cope with complex locations including complements, joins
+        and fuzzy positions. Even mixed strand features should work! This
+        also covers features on protein sequences (e.g. domains), although
+        here reverse strand features are not permitted. If the
+        location refers to other records, they must be supplied in the
+        optional dictionary references.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
+        >>> f.extract(seq)
+        Seq('VALIVIC')
+
+        If the FeatureLocation is None, e.g. when parsing invalid locus
+        locations in the GenBank parser, extract() will raise a ValueError.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import SeqFeature
+        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> f = SeqFeature(None, type="domain")
+        >>> f.extract(seq)
+        Traceback (most recent call last):
+           ...
+        ValueError: The feature's .location is None. Check the sequence file for a valid location.
+
+        Note - currently only compound features of type "join" are supported.
+        """
+        if self.location is None:
+            raise ValueError(
+                "The feature's .location is None. Check the "
+                "sequence file for a valid location."
+            )
+        return self.location.extract(parent_sequence, references=references)
+
+    def translate(
+        self,
+        parent_sequence,
+        table="Standard",
+        start_offset=None,
+        stop_symbol="*",
+        to_stop=False,
+        cds=None,
+        gap=None,
+    ):
+        """Get a translation of the feature's sequence.
+
+        This method is intended for CDS or other features that code proteins
+        and is a shortcut that will both extract the feature and
+        translate it, taking into account the codon_start and transl_table
+        qualifiers, if they are present. If they are not present the
+        value of the arguments "table" and "start_offset" are used.
+
+        The "cds" parameter is set to "True" if the feature is of type
+        "CDS" but can be overridden by giving an explicit argument.
+
+        The arguments stop_symbol, to_stop and gap have the same meaning
+        as Seq.translate, refer to that documentation for further information.
+
+        Arguments:
+         - parent_sequence - A DNA or RNA sequence.
+         - table - Which codon table to use if there is no transl_table
+           qualifier for this feature. This can be either a name
+           (string), an NCBI identifier (integer), or a CodonTable
+           object (useful for non-standard genetic codes).  This
+           defaults to the "Standard" table.
+         - start_offset - offset at which the first complete codon of a
+           coding feature can be found, relative to the first base of
+           that feature. Has a valid value of 0, 1 or 2. NOTE: this
+           uses python's 0-based numbering whereas the codon_start
+           qualifier in files from NCBI uses 1-based numbering.
+           This argument overrides any codon_start qualifier.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
+        >>> f = SeqFeature(FeatureLocation(0, 30), type="CDS")
+        >>> f.qualifiers['transl_table'] = [11]
+
+        Note that features of type CDS are subject to the usual
+        checks at translation. But you can override this behaviour
+        by giving explicit arguments:
+
+        >>> f.translate(seq, cds=False)
+        Seq('GYTYR*CL**')
+
+        Now use the start_offset argument to change the frame. Note
+        this uses python 0-based numbering.
+
+        >>> f.translate(seq, start_offset=1, cds=False)
+        Seq('VTLTDNVSD')
+
+        Alternatively use the codon_start qualifier to do the same
+        thing. Note: this uses 1-based numbering, which is found
+        in files from NCBI.
+
+        >>> f.qualifiers['codon_start'] = [2]
+        >>> f.translate(seq, cds=False)
+        Seq('VTLTDNVSD')
+        """
+        # see if this feature should be translated in a different
+        # frame using the "codon_start" qualifier
+        if start_offset is None:
+            try:
+                start_offset = int(self.qualifiers["codon_start"][0]) - 1
+            except KeyError:
+                start_offset = 0
+
+        if start_offset not in [0, 1, 2]:
+            raise ValueError(
+                "The start_offset must be 0, 1, or 2. "
+                f"The supplied value is {start_offset}. "
+                "Check the value of either the codon_start qualifier "
+                "or the start_offset argument"
+            )
+
+        feat_seq = self.extract(parent_sequence)[start_offset:]
+        codon_table = self.qualifiers.get("transl_table", [table])[0]
+
+        if cds is None:
+            cds = self.type == "CDS"
+
+        return feat_seq.translate(
+            table=codon_table,
+            stop_symbol=stop_symbol,
+            to_stop=to_stop,
+            cds=cds,
+            gap=gap,
+        )
+
+    def __bool__(self):
+        """Boolean value of an instance of this class (True).
+
+        This behaviour is for backwards compatibility, since until the
+        __len__ method was added, a SeqFeature always evaluated as True.
+
+        Note that in comparison, Seq objects, strings, lists, etc, will all
+        evaluate to False if they have length zero.
+
+        WARNING: The SeqFeature may in future evaluate to False when its
+        length is zero (in order to better match normal python behaviour)!
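+
+        An illustrative check, even for a feature without a location:
+
+        >>> from Bio.SeqFeature import SeqFeature
+        >>> bool(SeqFeature(None, type="misc_feature"))
+        True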
+        """
+        return True
+
+    def __len__(self):
+        """Return the length of the region where the feature is located.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
+        >>> len(f)
+        7
+        >>> f.extract(seq)
+        Seq('VALIVIC')
+        >>> len(f.extract(seq))
+        7
+
+        This is a proxy for taking the length of the feature's location:
+
+        >>> len(f.location)
+        7
+
+        For simple features this is the same as the region spanned (end
+        position minus start position using Pythonic counting). However, for
+        a compound location (e.g. a CDS as the join of several exons) the
+        gaps are not counted (e.g. introns). This ensures that len(f) matches
+        len(f.extract(parent_seq)), and also makes sure things work properly
+        with features wrapping the origin etc.
+        """
+        return len(self.location)
+
+    def __iter__(self):
+        """Iterate over the parent positions within the feature.
+
+        The iteration order is strand aware, and can be thought of as moving
+        along the feature using the parent sequence coordinates:
+
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
+        >>> len(f)
+        5
+        >>> for i in f: print(i)
+        9
+        8
+        7
+        6
+        5
+        >>> list(f)
+        [9, 8, 7, 6, 5]
+
+        This is a proxy for iterating over the location,
+
+        >>> list(f.location)
+        [9, 8, 7, 6, 5]
+        """
+        return iter(self.location)
+
+    def __contains__(self, value):
+        """Check if an integer position is within the feature.
+
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
+        >>> len(f)
+        5
+        >>> [i for i in range(15) if i in f]
+        [5, 6, 7, 8, 9]
+
+        For example, to see which features include a SNP position, you could
+        use this:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
+        >>> for f in record.features:
+        ...     if 1750 in f:
+        ...         print("%s %s" % (f.type, f.location))
+        source [0:154478](+)
+        gene [1716:4347](-)
+        tRNA join{[4310:4347](-), [1716:1751](-)}
+
+        Note that for a feature defined as a join of several subfeatures (e.g.
+        the union of several exons) the gaps are not checked (e.g. introns).
+        In this example, the tRNA location is defined in the GenBank file as
+        complement(join(1717..1751,4311..4347)), so that position 1760 falls
+        in the gap:
+
+        >>> for f in record.features:
+        ...     if 1760 in f:
+        ...         print("%s %s" % (f.type, f.location))
+        source [0:154478](+)
+        gene [1716:4347](-)
+
+        Note that additional care may be required with fuzzy locations, for
+        example just before a BeforePosition:
+
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> from Bio.SeqFeature import BeforePosition
+        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
+        >>> len(f)
+        5
+        >>> [i for i in range(10) if i in f]
+        [3, 4, 5, 6, 7]
+
+        Note that this is a proxy for testing membership on the location.
+
+        >>> [i for i in range(10) if i in f.location]
+        [3, 4, 5, 6, 7]
+        """
+        return value in self.location
+
+
+# --- References
+
+
+# TODO -- Will this hold PubMed and Medline information decently?
+class Reference:
+    """Represent a Generic Reference object.
+
+    Attributes:
+     - location - A list of Location objects specifying regions of
+       the sequence that the references correspond to. If no locations are
+       specified, the entire sequence is assumed.
+     - authors - A big old string, or a list split by author, of authors
+       for the reference.
+     - title - The title of the reference.
+     - journal - Journal the reference was published in.
+     - medline_id - A medline reference for the article.
+     - pubmed_id - A pubmed reference for the article.
+     - comment - A place to stick any comments about the reference.
+
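+    A minimal illustrative sketch of typical use (hypothetical values):
+
+    >>> from Bio.SeqFeature import Reference
+    >>> ref = Reference()
+    >>> ref.title = "A study of zinc finger genes"
+    >>> ref.pubmed_id = "1234567"
+    >>> ref
+    Reference(title='A study of zinc finger genes', ...)
+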
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.location = []
+        self.authors = ""
+        self.consrtm = ""
+        self.title = ""
+        self.journal = ""
+        self.medline_id = ""
+        self.pubmed_id = ""
+        self.comment = ""
+
+    def __str__(self):
+        """Return the full Reference object as a python string."""
+        out = ""
+        for single_location in self.location:
+            out += "location: %s\n" % single_location
+        out += "authors: %s\n" % self.authors
+        if self.consrtm:
+            out += "consrtm: %s\n" % self.consrtm
+        out += "title: %s\n" % self.title
+        out += "journal: %s\n" % self.journal
+        out += "medline id: %s\n" % self.medline_id
+        out += "pubmed id: %s\n" % self.pubmed_id
+        out += "comment: %s\n" % self.comment
+        return out
+
+    def __repr__(self):
+        """Represent the Reference object as a string for debugging."""
+        # TODO - Update this if __init__ later accepts values
+        return "%s(title=%r, ...)" % (self.__class__.__name__, self.title)
+
+    def __eq__(self, other):
+        """Check if two Reference objects should be considered equal.
+
+        Note prior to Biopython 1.70 the location was not compared, as
+        until then __eq__ for the FeatureLocation class was not defined.
+        """
+        return (
+            self.authors == other.authors
+            and self.consrtm == other.consrtm
+            and self.title == other.title
+            and self.journal == other.journal
+            and self.medline_id == other.medline_id
+            and self.pubmed_id == other.pubmed_id
+            and self.comment == other.comment
+            and self.location == other.location
+        )
+
+
+# --- Handling feature locations
+
+
+class FeatureLocation:
+    """Specify the location of a feature along a sequence.
+
+    The FeatureLocation is used for simple continuous features, which can
+    be described as running from a start position to an end position
+    (optionally with a strand and reference information).  More complex
+    locations made up from several non-continuous parts (e.g. a coding
+    sequence made up of several exons) are described using a SeqFeature
+    with a CompoundLocation.
+
+    Note that the start and end location numbering follow Python's scheme,
+    thus a GenBank entry of 123..150 (one based counting) becomes a location
+    of [122:150] (zero based counting).
+
+    >>> from Bio.SeqFeature import FeatureLocation
+    >>> f = FeatureLocation(122, 150)
+    >>> print(f)
+    [122:150]
+    >>> print(f.start)
+    122
+    >>> print(f.end)
+    150
+    >>> print(f.strand)
+    None
+
+    Note the strand defaults to None. If you are working with nucleotide
+    sequences you'd want to be explicit if it is the forward strand:
+
+    >>> from Bio.SeqFeature import FeatureLocation
+    >>> f = FeatureLocation(122, 150, strand=+1)
+    >>> print(f)
+    [122:150](+)
+    >>> print(f.strand)
+    1
+
+    Note that for a parent sequence of length n, the FeatureLocation
+    start and end must satisfy the inequality 0 <= start <= end <= n.
+    This means even for features on the reverse strand of a nucleotide
+    sequence, we expect the 'start' coordinate to be less than the
+    'end'.
+
+    >>> from Bio.SeqFeature import FeatureLocation
+    >>> r = FeatureLocation(122, 150, strand=-1)
+    >>> print(r)
+    [122:150](-)
+    >>> print(r.start)
+    122
+    >>> print(r.end)
+    150
+    >>> print(r.strand)
+    -1
+
+    i.e. Rather than thinking of the 'start' and 'end' biologically in a
+    strand aware manner, think of them as the 'left most' or 'minimum'
+    boundary, and the 'right most' or 'maximum' boundary of the region
+    being described. This is particularly important with compound
+    locations describing non-continuous regions.
+
+    In the example above we have used standard exact positions, but there
+    are also specialised position objects used to represent fuzzy positions
+    as well, for example a GenBank location like complement(<123..150)
+    would use a BeforePosition object for the start.
+    """
+
+    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
+        """Initialize the class.
+
+        start and end arguments specify the values where the feature begins
+        and ends. These can either by any of the ``*Position`` objects that
+        inherit from AbstractPosition, or can just be integers specifying the
+        position. In the case of integers, the values are assumed to be
+        exact and are converted in ExactPosition arguments. This is meant
+        to make it easy to deal with non-fuzzy ends.
+
+        i.e. Short form:
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> loc = FeatureLocation(5, 10, strand=-1)
+        >>> print(loc)
+        [5:10](-)
+
+        Explicit form:
+
+        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
+        >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
+        >>> print(loc)
+        [5:10](-)
+
+        Other fuzzy positions are used similarly,
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
+        >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
+        >>> print(loc2)
+        [<5:>10](-)
+
+        For nucleotide features you will also want to specify the strand,
+        use 1 for the forward (plus) strand, -1 for the reverse (negative)
+        strand, 0 for stranded but strand unknown (? in GFF3), or None for
+        when the strand does not apply (dot in GFF3), e.g. features on
+        proteins.
+
+        >>> loc = FeatureLocation(5, 10, strand=+1)
+        >>> print(loc)
+        [5:10](+)
+        >>> print(loc.strand)
+        1
+
+        Normally feature locations are given relative to the parent
+        sequence you are working with, but an explicit accession can
+        be given with the optional ref and db_ref strings:
+
+        >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
+        >>> print(loc)
+        AL391218.9[105172:108462](+)
+        >>> print(loc.ref)
+        AL391218.9
+
+        """
+        # TODO - Check 0 <= start <= end (<= length of reference)
+        if isinstance(start, AbstractPosition):
+            self._start = start
+        elif isinstance(start, int):
+            self._start = ExactPosition(start)
+        else:
+            raise TypeError("start=%r %s" % (start, type(start)))
+        if isinstance(end, AbstractPosition):
+            self._end = end
+        elif isinstance(end, int):
+            self._end = ExactPosition(end)
+        else:
+            raise TypeError("end=%r %s" % (end, type(end)))
+        if (
+            isinstance(self.start.position, int)
+            and isinstance(self.end.position, int)
+            and self.start > self.end
+        ):
+            raise ValueError(
+                f"End location ({self.end}) must be greater than "
+                f"or equal to start location ({self.start})"
+            )
+        self.strand = strand
+        self.ref = ref
+        self.ref_db = ref_db
+
+    def _get_strand(self):
+        """Get function for the strand property (PRIVATE)."""
+        return self._strand
+
+    def _set_strand(self, value):
+        """Set function for the strand property (PRIVATE)."""
+        if value not in [+1, -1, 0, None]:
+            raise ValueError("Strand should be +1, -1, 0 or None, not %r" % value)
+        self._strand = value
+
+    strand = property(
+        fget=_get_strand,
+        fset=_set_strand,
+        doc="Strand of the location (+1, -1, 0 or None).",
+    )
+
+    def __str__(self):
+        """Return a representation of the FeatureLocation object (with python counting).
+
+        For the simple case this uses the python splicing syntax, [122:150]
+        (zero based counting) which GenBank would call 123..150 (one based
+        counting).
+        """
+        answer = "[%s:%s]" % (self._start, self._end)
+        if self.ref and self.ref_db:
+            answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
+        elif self.ref:
+            answer = self.ref + answer
+        # Is ref_db without ref meaningful?
+        if self.strand is None:
+            return answer
+        elif self.strand == +1:
+            return answer + "(+)"
+        elif self.strand == -1:
+            return answer + "(-)"
+        else:
+            # strand = 0, stranded but strand unknown, ? in GFF3
+            return answer + "(?)"
+
+    def __repr__(self):
+        """Represent the FeatureLocation object as a string for debugging."""
+        optional = ""
+        if self.strand is not None:
+            optional += ", strand=%r" % self.strand
+        if self.ref is not None:
+            optional += ", ref=%r" % self.ref
+        if self.ref_db is not None:
+            optional += ", ref_db=%r" % self.ref_db
+        return "%s(%r, %r%s)" % (
+            self.__class__.__name__,
+            self.start,
+            self.end,
+            optional,
+        )
+
+    def __add__(self, other):
+        """Combine location with another FeatureLocation object, or shift it.
+
+        You can add two feature locations to make a join CompoundLocation:
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> f1 = FeatureLocation(5, 10)
+        >>> f2 = FeatureLocation(20, 30)
+        >>> combined = f1 + f2
+        >>> print(combined)
+        join{[5:10], [20:30]}
+
+        This is thus equivalent to:
+
+        >>> from Bio.SeqFeature import CompoundLocation
+        >>> join = CompoundLocation([f1, f2])
+        >>> print(join)
+        join{[5:10], [20:30]}
+
+        You can also use sum(...) in this way:
+
+        >>> join = sum([f1, f2])
+        >>> print(join)
+        join{[5:10], [20:30]}
+
+        Furthermore, you can combine a FeatureLocation with a CompoundLocation
+        in this way.
+
+        Separately, adding an integer will give a new FeatureLocation with
+        its start and end offset by that amount. For example:
+
+        >>> print(f1)
+        [5:10]
+        >>> print(f1 + 100)
+        [105:110]
+        >>> print(200 + f1)
+        [205:210]
+
+        This can be useful when editing annotation.
+        """
+        if isinstance(other, FeatureLocation):
+            return CompoundLocation([self, other])
+        elif isinstance(other, int):
+            return self._shift(other)
+        else:
+            # This will allow CompoundLocation's __radd__ to be called:
+            return NotImplemented
+
+    def __radd__(self, other):
+        """Add a feature locationanother FeatureLocation object to the left."""
+        if isinstance(other, int):
+            return self._shift(other)
+        else:
+            return NotImplemented
+
+    def __bool__(self):
+        """Return True regardless of the length of the feature.
+
+        This behaviour is for backwards compatibility, since until the
+        __len__ method was added, a FeatureLocation always evaluated as True.
+
+        Note that in comparison, Seq objects, strings, lists, etc, will all
+        evaluate to False if they have length zero.
+
+        WARNING: The FeatureLocation may in future evaluate to False when its
+        length is zero (in order to better match normal python behaviour)!
+        """
+        return True
+
+    def __len__(self):
+        """Return the length of the region described by the FeatureLocation object.
+
+        Note that extra care may be needed for fuzzy locations, e.g.
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
+        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
+        >>> len(loc)
+        5
+        """
+        return int(self._end) - int(self._start)
+
+    def __contains__(self, value):
+        """Check if an integer position is within the FeatureLocation object.
+
+        Note that extra care may be needed for fuzzy locations, e.g.
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
+        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
+        >>> len(loc)
+        5
+        >>> [i for i in range(15) if i in loc]
+        [5, 6, 7, 8, 9]
+        """
+        if not isinstance(value, int):
+            raise ValueError(
+                "Currently we only support checking for integer "
+                "positions being within a FeatureLocation."
+            )
+        if value < self._start or value >= self._end:
+            return False
+        else:
+            return True
+
+    def __iter__(self):
+        """Iterate over the parent positions within the FeatureLocation object.
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
+        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
+        >>> len(loc)
+        5
+        >>> for i in loc: print(i)
+        5
+        6
+        7
+        8
+        9
+        >>> list(loc)
+        [5, 6, 7, 8, 9]
+        >>> [i for i in range(15) if i in loc]
+        [5, 6, 7, 8, 9]
+
+        Note this is strand aware:
+
+        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
+        >>> list(loc)
+        [9, 8, 7, 6, 5]
+        """
+        if self.strand == -1:
+            yield from range(self._end - 1, self._start - 1, -1)
+        else:
+            yield from range(self._start, self._end)
+
+    def __eq__(self, other):
+        """Implement equality by comparing all the location attributes."""
+        if not isinstance(other, FeatureLocation):
+            return False
+        return (
+            self._start == other.start
+            and self._end == other.end
+            and self._strand == other.strand
+            and self.ref == other.ref
+            and self.ref_db == other.ref_db
+        )
+
+    def _shift(self, offset):
+        """Return a copy of the FeatureLocation shifted by an offset (PRIVATE).
+
+        Returns self when location is relative to an external reference.
+        """
+        # TODO - What if offset is a fuzzy position?
+        if self.ref or self.ref_db:
+            return self
+        return FeatureLocation(
+            start=self._start._shift(offset),
+            end=self._end._shift(offset),
+            strand=self.strand,
+        )
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE).
+
+        Returns self when location is relative to an external reference.
+        """
+        if self.ref or self.ref_db:
+            return self
+        # Note this will flip the start and end too!
+        if self.strand == +1:
+            flip_strand = -1
+        elif self.strand == -1:
+            flip_strand = +1
+        else:
+            # 0 or None
+            flip_strand = self.strand
+        return FeatureLocation(
+            start=self._end._flip(length),
+            end=self._start._flip(length),
+            strand=flip_strand,
+        )
+
+    @property
+    def parts(self):
+        """Read only list of sections (always one, the FeatureLocation object).
+
+        This is a convenience property allowing you to write code handling
+        both simple FeatureLocation objects (with one part) and more complex
+        CompoundLocation objects (with multiple parts) interchangeably.
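+
+        An illustrative check that a simple location has exactly one part,
+        itself:
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> loc = FeatureLocation(5, 10)
+        >>> len(loc.parts)
+        1
+        >>> loc.parts[0] is loc
+        True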
+        """
+        return [self]
+
+    @property
+    def start(self):
+        """Start location - left most (minimum) value, regardless of strand.
+
+        Read only, returns an integer like position object, possibly a fuzzy
+        position.
+        """
+        return self._start
+
+    @property
+    def end(self):
+        """End location - right most (maximum) value, regardless of strand.
+
+        Read only, returns an integer like position object, possibly a fuzzy
+        position.
+        """
+        return self._end
+
+    @property
+    def nofuzzy_start(self):
+        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
+
+        This is now an alias for int(feature.start), which should be
+        used in preference -- unless you are trying to support old
+        versions of Biopython.
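+
+        An illustrative check with an exact position:
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> loc = FeatureLocation(5, 10)
+        >>> loc.nofuzzy_start == int(loc.start) == 5
+        True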
+        """
+        try:
+            return int(self._start)
+        except TypeError:
+            if isinstance(self._start, UnknownPosition):
+                return None
+            raise
+
+    @property
+    def nofuzzy_end(self):
+        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
+
+        This is now an alias for int(feature.end), which should be
+        used in preference -- unless you are trying to support old
+        versions of Biopython.
+        """
+        try:
+            return int(self._end)
+        except TypeError:
+            if isinstance(self._end, UnknownPosition):
+                return None
+            raise
+
+    def extract(self, parent_sequence, references=None):
+        """Extract the sequence from supplied parent sequence using the FeatureLocation object.
+
+        The parent_sequence can be a Seq like object or a string, and will
+        generally return an object of the same type. The exception is a
+        MutableSeq parent sequence, which returns a Seq object.
+        If the location refers to other records, they must be supplied
+        in the optional dictionary references.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> feature_loc = FeatureLocation(8, 15)
+        >>> feature_loc.extract(seq)
+        Seq('VALIVIC')
+
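+        If the location names another record via its ref, supply that record
+        in the optional references dictionary (an illustrative sketch using a
+        hypothetical identifier "other_seq"):
+
+        >>> loc = FeatureLocation(2, 8, ref="other_seq")
+        >>> loc.extract(seq, references={"other_seq": Seq("ACTGACTGACTG")})
+        Seq('TGACTG')
+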
+        """
+        if self.ref or self.ref_db:
+            if not references:
+                raise ValueError(
+                    f"Feature references another sequence ({self.ref}),"
+                    " references mandatory"
+                )
+            elif self.ref not in references:
+                # KeyError?
+                raise ValueError(
+                    f"Feature references another sequence ({self.ref}),"
+                    " not found in references"
+                )
+            parent_sequence = references[self.ref]
+            try:
+                # If was a SeqRecord, just take the sequence
+                # (should focus on the annotation of the feature)
+                parent_sequence = parent_sequence.seq
+            except AttributeError:
+                pass
+        if isinstance(parent_sequence, MutableSeq):
+            # This avoids complications with reverse complements
+            # (the MutableSeq reverse complement acts in situ)
+            parent_sequence = Seq(parent_sequence)
+        f_seq = parent_sequence[self.nofuzzy_start : self.nofuzzy_end]
+        if self.strand == -1:
+            try:
+                f_seq = f_seq.reverse_complement()
+            except AttributeError:
+                assert isinstance(f_seq, str)
+                f_seq = reverse_complement(f_seq)
+        return f_seq
+
+
+class CompoundLocation:
+    """For handling joins etc where a feature location has several parts."""
+
+    def __init__(self, parts, operator="join"):
+        """Initialize the class.
+
+        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
+        >>> f1 = FeatureLocation(10, 40, strand=+1)
+        >>> f2 = FeatureLocation(50, 59, strand=+1)
+        >>> f = CompoundLocation([f1, f2])
+        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
+        True
+        >>> print(f.operator)
+        join
+        >>> 5 in f
+        False
+        >>> 15 in f
+        True
+        >>> f.strand
+        1
+
+        Notice that the strand of the compound location is computed
+        automatically - in the case of mixed strands on the sub-locations
+        the overall strand is set to None.
+
+        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
+        ...                       FeatureLocation(10, 13, strand=-1)])
+        >>> print(f.strand)
+        None
+        >>> len(f)
+        6
+        >>> list(f)
+        [3, 4, 5, 12, 11, 10]
+
+        The example above doing list(f) iterates over the coordinates within the
+        feature. This allows you to use max and min on the location, to find the
+        range covered:
+
+        >>> min(f)
+        3
+        >>> max(f)
+        12
+
+        More generally, you can use the compound location's start and end which
+        give the full span covered, 0 <= start <= end <= full sequence length.
+
+        >>> f.start == min(f)
+        True
+        >>> f.end == max(f) + 1
+        True
+
+        This is consistent with the behaviour of the simple FeatureLocation for
+        a single region, where again the 'start' and 'end' do not necessarily
+        give the biological start and end, but rather the 'minimal' and 'maximal'
+        coordinate boundaries.
+
+        Note that adding locations provides a more intuitive method of
+        construction:
+
+        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
+        >>> len(f)
+        6
+        >>> list(f)
+        [3, 4, 5, 12, 11, 10]
+        """
+        self.operator = operator
+        self.parts = list(parts)
+        for loc in self.parts:
+            if not isinstance(loc, FeatureLocation):
+                raise ValueError(
+                    "CompoundLocation should be given a list of "
+                    "FeatureLocation objects, not %s" % loc.__class__
+                )
+        if len(parts) < 2:
+            raise ValueError(
+                "CompoundLocation should have at least 2 parts, not %r" % parts
+            )
+
+    def __str__(self):
+        """Return a representation of the CompoundLocation object (with python counting)."""
+        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))
+
+    def __repr__(self):
+        """Represent the CompoundLocation object as string for debugging."""
+        return "%s(%r, %r)" % (self.__class__.__name__, self.parts, self.operator)
+
+    def _get_strand(self):
+        """Get function for the strand property (PRIVATE)."""
+        # Historically a join on the reverse strand has been represented
+        # in Biopython with both the parent SeqFeature and its children
+        # (the exons for a CDS) all given a strand of -1.  Likewise, for
+        # a join feature on the forward strand they all have strand +1.
+        # However, we must also consider evil mixed strand examples like
+        # this, join(complement(69611..69724),139856..140087,140625..140650)
+        if len({loc.strand for loc in self.parts}) == 1:
+            return self.parts[0].strand
+        else:
+            return None  # i.e. mixed strands
+
+    def _set_strand(self, value):
+        """Set function for the strand property (PRIVATE)."""
+        # Should this be allowed/encouraged?
+        for loc in self.parts:
+            loc.strand = value
+
+    strand = property(
+        fget=_get_strand,
+        fset=_set_strand,
+        doc="""Overall strand of the compound location.
+
+        If all the parts have the same strand, that is returned. Otherwise
+        for mixed strands, this returns None.
+
+        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
+        >>> f1 = FeatureLocation(15, 17, strand=1)
+        >>> f2 = FeatureLocation(20, 30, strand=-1)
+        >>> f = f1 + f2
+        >>> f1.strand
+        1
+        >>> f2.strand
+        -1
+        >>> f.strand
+        >>> f.strand is None
+        True
+
+        If you set the strand of a CompoundLocation, this is applied to
+        all the parts - use with caution:
+
+        >>> f.strand = 1
+        >>> f1.strand
+        1
+        >>> f2.strand
+        1
+        >>> f.strand
+        1
+
+        """,
+    )
+
+    def __add__(self, other):
+        """Combine locations, or shift the location by an integer offset.
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
+        >>> print(f1)
+        join{[15:17], [20:30]}
+
+        You can add another FeatureLocation:
+
+        >>> print(f1 + FeatureLocation(40, 50))
+        join{[15:17], [20:30], [40:50]}
+        >>> print(FeatureLocation(5, 10) + f1)
+        join{[5:10], [15:17], [20:30]}
+
+        You can also add another CompoundLocation:
+
+        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
+        >>> print(f2)
+        join{[40:50], [60:70]}
+        >>> print(f1 + f2)
+        join{[15:17], [20:30], [40:50], [60:70]}
+
+        Also, as with the FeatureLocation, adding an integer shifts the
+        location's co-ordinates by that offset:
+
+        >>> print(f1 + 100)
+        join{[115:117], [120:130]}
+        >>> print(200 + f1)
+        join{[215:217], [220:230]}
+        >>> print(f1 + (-5))
+        join{[10:12], [15:25]}
+        """
+        if isinstance(other, FeatureLocation):
+            return CompoundLocation(self.parts + [other], self.operator)
+        elif isinstance(other, CompoundLocation):
+            if self.operator != other.operator:
+                # Handle join+order -> order as a special case?
+                raise ValueError(
+                    "Mixed operators %s and %s" % (self.operator, other.operator)
+                )
+            return CompoundLocation(self.parts + other.parts, self.operator)
+        elif isinstance(other, int):
+            return self._shift(other)
+        else:
+            raise NotImplementedError
+
+    def __radd__(self, other):
+        """Add a feature to the left."""
+        if isinstance(other, FeatureLocation):
+            return CompoundLocation([other] + self.parts, self.operator)
+        elif isinstance(other, int):
+            return self._shift(other)
+        else:
+            raise NotImplementedError
+
+    def __contains__(self, value):
+        """Check if an integer position is within the CompoundLocation object."""
+        for loc in self.parts:
+            if value in loc:
+                return True
+        return False
+
+    def __bool__(self):
+        """Return True regardless of the length of the feature.
+
+        This behaviour is for backwards compatibility, since until the
+        __len__ method was added, a CompoundLocation always evaluated as True.
+
+        Note that in comparison, Seq objects, strings, lists, etc, will all
+        evaluate to False if they have length zero.
+
+        WARNING: The CompoundLocation may in future evaluate to False when its
+        length is zero (in order to better match normal python behaviour)!
+        """
+        return True
+
+    def __len__(self):
+        """Return the length of the CompoundLocation object."""
+        return sum(len(loc) for loc in self.parts)
+
+    def __iter__(self):
+        """Iterate over the parent positions within the CompoundLocation object."""
+        for loc in self.parts:
+            yield from loc
+
+    def __eq__(self, other):
+        """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
+        if not isinstance(other, CompoundLocation):
+            return False
+        if len(self.parts) != len(other.parts):
+            return False
+        if self.operator != other.operator:
+            return False
+        for self_part, other_part in zip(self.parts, other.parts):
+            if self_part != other_part:
+                return False
+        return True
+
+    def _shift(self, offset):
+        """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
+        return CompoundLocation(
+            [loc._shift(offset) for loc in self.parts], self.operator
+        )
+
+    def _flip(self, length):
+        """Return a copy of the locations after the parent is reversed (PRIVATE).
+
+        Note that the order of the parts is NOT reversed too. Consider a CDS
+        on the forward strand with exons small, medium and large (in length).
+        Once we change the frame of reference to the reverse complement strand,
+        the start codon is still part of the small exon, and the stop codon
+        still part of the large exon - so the part order remains the same!
+
+        Here is an artificial example, where the features map to the two upper
+        case regions and the lower case runs of n are not used:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
+        >>> small = FeatureLocation(5, 20, strand=1)
+        >>> large = FeatureLocation(28, 52, strand=1)
+        >>> location = small + large
+        >>> print(small)
+        [5:20](+)
+        >>> print(large)
+        [28:52](+)
+        >>> print(location)
+        join{[5:20](+), [28:52](+)}
+        >>> for part in location.parts:
+        ...     print(len(part))
+        ...
+        15
+        24
+
+        As you can see, this is a silly example where each "exon" is a word:
+
+        >>> print(small.extract(dna).translate())
+        SILLY
+        >>> print(large.extract(dna).translate())
+        EXAMPLE*
+        >>> print(location.extract(dna).translate())
+        SILLYEXAMPLE*
+        >>> for part in location.parts:
+        ...     print(part.extract(dna).translate())
+        ...
+        SILLY
+        EXAMPLE*
+
+        Now, let's look at this from the reverse strand frame of reference:
+
+        >>> flipped_dna = dna.reverse_complement()
+        >>> flipped_location = location._flip(len(dna))
+        >>> print(flipped_location.extract(flipped_dna).translate())
+        SILLYEXAMPLE*
+        >>> for part in flipped_location.parts:
+        ...     print(part.extract(flipped_dna).translate())
+        ...
+        SILLY
+        EXAMPLE*
+
+        The key point here is the first part of the CompoundFeature is still the
+        small exon, while the second part is still the large exon:
+
+        >>> for part in flipped_location.parts:
+        ...     print(len(part))
+        ...
+        15
+        24
+        >>> print(flipped_location)
+        join{[37:52](-), [5:29](-)}
+
+        Notice the parts are not reversed. However, there was a bug here in older
+        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
+        and the translation would have wrongly been "EXAMPLE*SILLY" instead.
+
+        """
+        return CompoundLocation(
+            [loc._flip(length) for loc in self.parts], self.operator
+        )
+
+    @property
+    def start(self):
+        """Start location - left most (minimum) value, regardless of strand.
+
+        Read only, returns an integer like position object, possibly a fuzzy
+        position.
+
+        For the special case of a CompoundLocation wrapping the origin of a
+        circular genome, this will return zero.
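+
+        For example, here is an illustrative sketch of such an
+        origin-wrapping feature on a 100bp circular genome:
+
+        >>> from Bio.SeqFeature import FeatureLocation
+        >>> before_origin = FeatureLocation(95, 100, strand=1)
+        >>> after_origin = FeatureLocation(0, 5, strand=1)
+        >>> loc = before_origin + after_origin
+        >>> loc.start
+        ExactPosition(0)
+        >>> loc.end
+        ExactPosition(100)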
+        """
+        return min(loc.start for loc in self.parts)
+
+    @property
+    def end(self):
+        """End location - right most (maximum) value, regardless of strand.
+
+        Read only, returns an integer like position object, possibly a fuzzy
+        position.
+
+        For the special case of a CompoundLocation wrapping the origin of
+        a circular genome this will match the genome length (minus one
+        given how Python counts from zero).
+        """
+        return max(loc.end for loc in self.parts)
+
+    @property
+    def nofuzzy_start(self):
+        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
+
+        This is an alias for int(feature.start), which should be used in
+        preference -- unless you are trying to support old versions of
+        Biopython.
+        """
+        try:
+            return int(self.start)
+        except TypeError:
+            if isinstance(self.start, UnknownPosition):
+                return None
+            raise
+
+    @property
+    def nofuzzy_end(self):
+        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
+
+        This is an alias for int(feature.end), which should be used in
+        preference -- unless you are trying to support old versions of
+        Biopython.
+        """
+        try:
+            return int(self.end)
+        except TypeError:
+            if isinstance(self.end, UnknownPosition):
+                return None
+            raise
+
+    @property
+    def ref(self):
+        """Not present in CompoundLocation, dummy method for API compatibility."""
+        return None
+
+    @property
+    def ref_db(self):
+        """Not present in CompoundLocation, dummy method for API compatibility."""
+        return None
+
+    def extract(self, parent_sequence, references=None):
+        """Extract the sequence from supplied parent sequence using the CompoundLocation object.
+
+        The parent_sequence can be a Seq like object or a string, and will
+        generally return an object of the same type. The exception to this is
+        a MutableSeq as the parent sequence will return a Seq object.
+        If the location refers to other records, they must be supplied
+        in the optional dictionary references.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
+        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
+        >>> fl1 = FeatureLocation(2, 8)
+        >>> fl2 = FeatureLocation(10, 15)
+        >>> fl3 = CompoundLocation([fl1,fl2])
+        >>> fl3.extract(seq)
+        Seq('QHKAMILIVIC')
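+
+        The parts are extracted in the order given, so listing them the
+        other way round changes the result:
+
+        >>> CompoundLocation([fl2, fl1]).extract(seq)
+        Seq('LIVICQHKAMI')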
+
+        """
+        # This copes with mixed strand features & all on reverse:
+        parts = [
+            loc.extract(parent_sequence, references=references) for loc in self.parts
+        ]
+        f_seq = functools.reduce(lambda x, y: x + y, parts)
+        return f_seq
+
+
+class AbstractPosition:
+    """Abstract base class representing a position."""
+
+    def __repr__(self):
+        """Represent the AbstractPosition object as a string for debugging."""
+        return "%s(...)" % (self.__class__.__name__)
+
+
+class ExactPosition(int, AbstractPosition):
+    """Specify the specific position of a boundary.
+
+    Arguments:
+     - position - The position of the boundary.
+     - extension - An optional argument which must be zero since we don't
+       have an extension. The argument is provided so that the same number
+       of arguments can be passed to all position types.
+
+    In this case, there is no fuzziness associated with the position.
+
+    >>> p = ExactPosition(5)
+    >>> p
+    ExactPosition(5)
+    >>> print(p)
+    5
+
+    >>> isinstance(p, AbstractPosition)
+    True
+    >>> isinstance(p, int)
+    True
+
+    Integer comparisons and operations should work as expected:
+
+    >>> p == 5
+    True
+    >>> p < 6
+    True
+    >>> p <= 5
+    True
+    >>> p + 10
+    15
+
+    """
+
+    def __new__(cls, position, extension=0):
+        """Create an ExactPosition object."""
+        if extension != 0:
+            raise AttributeError(
+                "Non-zero extension %s for exact position." % extension
+            )
+        return int.__new__(cls, position)
+
+    # Must define this on Python 3.8 onwards because we redefine __repr__
+    def __str__(self):
+        """Return a representation of the ExactPosition object (with python counting)."""
+        return str(int(self))
+
+    def __repr__(self):
+        """Represent the ExactPosition object as a string for debugging."""
+        return "%s(%i)" % (self.__class__.__name__, int(self))
+
+    @property
+    def position(self):
+        """Legacy attribute to get position as integer (OBSOLETE)."""
+        return int(self)
+
+    @property
+    def extension(self):
+        """Not present in this object, return zero (OBSOLETE)."""
+        return 0
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        # By default preserve any subclass
+        return self.__class__(int(self) + offset)
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        # By default preserve any subclass
+        return self.__class__(length - int(self))
+
+
+class UncertainPosition(ExactPosition):
+    """Specify a specific position which is uncertain.
+
+    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
+    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
+    """
+
+    pass
+
+
+class UnknownPosition(AbstractPosition):
+    """Specify a specific position which is unknown (has no position).
+
+    This is used in UniProt, e.g. ? or in the XML as unknown.
+    """
+
+    def __repr__(self):
+        """Represent the UnknownPosition object as a string for debugging."""
+        return "%s()" % self.__class__.__name__
+
+    def __hash__(self):
+        """Return the hash value of the UnknownPosition object."""
+        return hash(None)
+
+    @property
+    def position(self):
+        """Legacy attribute to get location (None) (OBSOLETE)."""
+        return None
+
+    @property
+    def extension(self):  # noqa: D402
+        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
+        return 0
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return self
+
+
+class WithinPosition(int, AbstractPosition):
+    """Specify the position of a boundary within some coordinates.
+
+    Arguments:
+     - position - The default integer position
+     - left - The start (left) position of the boundary
+     - right - The end (right) position of the boundary
+
+    This allows dealing with a location like ((1.4)..100). This
+    indicates that the start of the sequence is somewhere between 1
+    and 4. Since this is a start coordinate, it should act like
+    it is at position 1 (or in Python counting, 0).
+
+    >>> p = WithinPosition(10, 10, 13)
+    >>> p
+    WithinPosition(10, left=10, right=13)
+    >>> print(p)
+    (10.13)
+    >>> int(p)
+    10
+
+    Basic integer comparisons and operations should work as though
+    this were a plain integer:
+
+    >>> p == 10
+    True
+    >>> p in [9, 10, 11]
+    True
+    >>> p < 11
+    True
+    >>> p + 10
+    20
+
+    >>> isinstance(p, WithinPosition)
+    True
+    >>> isinstance(p, AbstractPosition)
+    True
+    >>> isinstance(p, int)
+    True
+
+    Note this also applies for comparison to other position objects,
+    where again the integer behaviour is used:
+
+    >>> p == 10
+    True
+    >>> p == ExactPosition(10)
+    True
+    >>> p == BeforePosition(10)
+    True
+    >>> p == AfterPosition(10)
+    True
+
+    If this were an end point, you would want the position to be 13:
+
+    >>> p2 = WithinPosition(13, 10, 13)
+    >>> p2
+    WithinPosition(13, left=10, right=13)
+    >>> print(p2)
+    (10.13)
+    >>> int(p2)
+    13
+    >>> p2 == 13
+    True
+    >>> p2 == ExactPosition(13)
+    True
+
+    The old legacy properties of position and extension give the
+    starting/lower/left position as an integer, and the distance
+    to the ending/higher/right position as an integer. Note that
+    the position object will act like either the left or the right
+    end-point depending on how it was created:
+
+    >>> p.position == p2.position == 10
+    True
+    >>> p.extension == p2.extension == 3
+    True
+    >>> int(p) == int(p2)
+    False
+    >>> p == 10
+    True
+    >>> p2 == 13
+    True
+
+    """
+
+    def __new__(cls, position, left, right):
+        """Create a WithinPosition object."""
+        if not (position == left or position == right):
+            raise RuntimeError(
+                "WithinPosition: %r should match left %r or "
+                "right %r" % (position, left, right)
+            )
+        obj = int.__new__(cls, position)
+        obj._left = left
+        obj._right = right
+        return obj
+
+    def __getnewargs__(self):
+        """Return the arguments accepted by __new__.
+
+        Necessary to allow pickling and unpickling of class instances.
+        """
+        return (int(self), self._left, self._right)
+
+    def __repr__(self):
+        """Represent the WithinPosition object as a string for debugging."""
+        return "%s(%i, left=%i, right=%i)" % (
+            self.__class__.__name__,
+            int(self),
+            self._left,
+            self._right,
+        )
+
+    def __str__(self):
+        """Return a representation of the WithinPosition object (with python counting)."""
+        return "(%s.%s)" % (self._left, self._right)
+
+    @property
+    def position(self):
+        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
+        return self._left
+
+    @property
+    def extension(self):  # noqa: D402
+        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""  # noqa: D402
+        return self._right - self._left
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self.__class__(
+            int(self) + offset, self._left + offset, self._right + offset
+        )
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return self.__class__(
+            length - int(self), length - self._right, length - self._left
+        )
+
+
+class BetweenPosition(int, AbstractPosition):
+    """Specify the position of a boundary between two coordinates (OBSOLETE?).
+
+    Arguments:
+     - position - The default integer position
+     - left - The start (left) position of the boundary
+     - right - The end (right) position of the boundary
+
+    This allows dealing with a position like 123^456. This
+    indicates that the start of the sequence is somewhere between
+    123 and 456. It is up to the parser to set the position argument
+    to either boundary point (depending on if this is being used as
+    a start or end of the feature). For example as a feature end:
+
+    >>> p = BetweenPosition(456, 123, 456)
+    >>> p
+    BetweenPosition(456, left=123, right=456)
+    >>> print(p)
+    (123^456)
+    >>> int(p)
+    456
+
+    Integer equality and comparison use the given position,
+
+    >>> p == 456
+    True
+    >>> p in [455, 456, 457]
+    True
+    >>> p > 300
+    True
+
+    The old legacy properties of position and extension give the
+    starting/lower/left position as an integer, and the distance
+    to the ending/higher/right position as an integer. Note that
+    the position object will act like either the left or the right
+    end-point depending on how it was created:
+
+    >>> p2 = BetweenPosition(123, left=123, right=456)
+    >>> p.position == p2.position == 123
+    True
+    >>> p.extension
+    333
+    >>> p2.extension
+    333
+    >>> p.extension == p2.extension == 333
+    True
+    >>> int(p) == int(p2)
+    False
+    >>> p == 456
+    True
+    >>> p2 == 123
+    True
+
+    Note this potentially surprising behaviour:
+
+    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
+    True
+    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
+    True
+    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
+    True
+
+    i.e. For equality (and sorting) the position objects behave like
+    integers.
+
+    """
+
+    def __new__(cls, position, left, right):
+        """Create a new instance in BetweenPosition object."""
+        assert position == left or position == right
+        obj = int.__new__(cls, position)
+        obj._left = left
+        obj._right = right
+        return obj
+
+    def __getnewargs__(self):
+        """Return the arguments accepted by __new__.
+
+        Necessary to allow pickling and unpickling of class instances.
+        """
+        return (int(self), self._left, self._right)
+
+    def __repr__(self):
+        """Represent the BetweenPosition object as a string for debugging."""
+        return "%s(%i, left=%i, right=%i)" % (
+            self.__class__.__name__,
+            int(self),
+            self._left,
+            self._right,
+        )
+
+    def __str__(self):
+        """Return a representation of the BetweenPosition object (with python counting)."""
+        return "(%s^%s)" % (self._left, self._right)
+
+    @property
+    def position(self):
+        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
+        return self._left
+
+    @property
+    def extension(self):  # noqa: D402
+        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""  # noqa: D402
+        return self._right - self._left
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self.__class__(
+            int(self) + offset, self._left + offset, self._right + offset
+        )
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return self.__class__(
+            length - int(self), length - self._right, length - self._left
+        )
+
+
+class BeforePosition(int, AbstractPosition):
+    """Specify a position where the actual location occurs before it.
+
+    Arguments:
+     - position - The upper boundary of where the location can occur.
+     - extension - An optional argument which must be zero since we don't
+       have an extension. The argument is provided so that the same number
+       of arguments can be passed to all position types.
+
+    This is used to specify positions like (<10..100) where the location
+    occurs somewhere before position 10.
+
+    >>> p = BeforePosition(5)
+    >>> p
+    BeforePosition(5)
+    >>> print(p)
+    <5
+    >>> int(p)
+    5
+    >>> p + 10
+    15
+
+    Note this potentially surprising behaviour:
+
+    >>> p == ExactPosition(5)
+    True
+    >>> p == AfterPosition(5)
+    True
+
+    Just remember that for equality and sorting the position objects act
+    like integers.
+    """
+
+    # Subclasses int so can't use __init__
+    def __new__(cls, position, extension=0):
+        """Create a new instance in BeforePosition object."""
+        if extension != 0:
+            raise AttributeError(
+                "Non-zero extension %s for exact position." % extension
+            )
+        return int.__new__(cls, position)
+
+    @property
+    def position(self):
+        """Legacy attribute to get position as integer (OBSOLETE)."""
+        return int(self)
+
+    @property
+    def extension(self):  # noqa: D402
+        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
+        return 0
+
+    def __repr__(self):
+        """Represent the location as a string for debugging."""
+        return "%s(%i)" % (self.__class__.__name__, int(self))
+
+    def __str__(self):
+        """Return a representation of the BeforePosition object (with python counting)."""
+        return "<%s" % self.position
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self.__class__(int(self) + offset)
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return AfterPosition(length - int(self))
+
+
+class AfterPosition(int, AbstractPosition):
+    """Specify a position where the actual location is found after it.
+
+    Arguments:
+     - position - The lower boundary of where the location can occur.
+     - extension - An optional argument which must be zero since we don't
+       have an extension. The argument is provided so that the same number
+       of arguments can be passed to all position types.
+
+    This is used to specify positions like (>10..100) where the location
+    occurs somewhere after position 10.
+
+    >>> p = AfterPosition(7)
+    >>> p
+    AfterPosition(7)
+    >>> print(p)
+    >7
+    >>> int(p)
+    7
+    >>> p + 10
+    17
+
+    >>> isinstance(p, AfterPosition)
+    True
+    >>> isinstance(p, AbstractPosition)
+    True
+    >>> isinstance(p, int)
+    True
+
+    Note this potentially surprising behaviour:
+
+    >>> p == ExactPosition(7)
+    True
+    >>> p == BeforePosition(7)
+    True
+
+    Just remember that for equality and sorting the position objects act
+    like integers.
+    """
+
+    # Subclasses int so can't use __init__
+    def __new__(cls, position, extension=0):
+        """Create a new instance of the AfterPosition object."""
+        if extension != 0:
+            raise AttributeError(
+                "Non-zero extension %s for exact position." % extension
+            )
+        return int.__new__(cls, position)
+
+    @property
+    def position(self):
+        """Legacy attribute to get position as integer (OBSOLETE)."""
+        return int(self)
+
+    @property
+    def extension(self):  # noqa: D402
+        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
+        return 0
+
+    def __repr__(self):
+        """Represent the location as a string for debugging."""
+        return "%s(%i)" % (self.__class__.__name__, int(self))
+
+    def __str__(self):
+        """Return a representation of the AfterPosition object (with python counting)."""
+        return ">%s" % self.position
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self.__class__(int(self) + offset)
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return BeforePosition(length - int(self))
+
+
+class OneOfPosition(int, AbstractPosition):
+    """Specify a position where the location can be multiple positions.
+
+    This models the GenBank 'one-of(1888,1901)' function, and tries
+    to make this fit within the Biopython Position models. If this was
+    a start position it should act like 1888, but as an end position 1901.
+
+    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
+    >>> p
+    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
+    >>> int(p)
+    1888
+
+    Integer comparisons and operators act like using int(p),
+
+    >>> p == 1888
+    True
+    >>> p <= 1888
+    True
+    >>> p > 1888
+    False
+    >>> p + 100
+    1988
+
+    >>> isinstance(p, OneOfPosition)
+    True
+    >>> isinstance(p, AbstractPosition)
+    True
+    >>> isinstance(p, int)
+    True
+
+    The old legacy properties of position and extension give the
+    starting/lowest/left-most position as an integer, and the
+    distance to the ending/highest/right-most position as an integer.
+    Note that the position object will act like one of the list of
+    possible locations depending on how it was created:
+
+    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
+    >>> p.position == p2.position == 1888
+    True
+    >>> p.extension == p2.extension == 13
+    True
+    >>> int(p) == int(p2)
+    False
+    >>> p == 1888
+    True
+    >>> p2 == 1901
+    True
+
+    """
+
+    def __new__(cls, position, choices):
+        """Initialize with a set of possible positions.
+
+        position_list is a list of AbstractPosition derived objects,
+        specifying possible locations.
+
+        position is an integer specifying the default behaviour.
+        """
+        if position not in choices:
+            raise ValueError(
+                "OneOfPosition: %r should match one of %r" % (position, choices)
+            )
+        obj = int.__new__(cls, position)
+        obj.position_choices = choices
+        return obj
+
+    def __getnewargs__(self):
+        """Return the arguments accepted by __new__.
+
+        Necessary to allow pickling and unpickling of class instances.
+        """
+        return (int(self), self.position_choices)
+
+    @property
+    def position(self):
+        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
+        return min(int(pos) for pos in self.position_choices)
+
+    @property
+    def extension(self):
+        """Legacy attribute to get extension as integer (OBSOLETE)."""
+        positions = [int(pos) for pos in self.position_choices]
+        return max(positions) - min(positions)
+
+    def __repr__(self):
+        """Represent the OneOfPosition object as a string for debugging."""
+        return "%s(%i, choices=%r)" % (
+            self.__class__.__name__,
+            int(self),
+            self.position_choices,
+        )
+
+    def __str__(self):
+        """Return a representation of the OneOfPosition object (with python counting)."""
+        out = "one-of("
+        for position in self.position_choices:
+            out += "%s," % position
+        # replace the last comma with the closing parenthesis
+        return out[:-1] + ")"
+
+    def _shift(self, offset):
+        """Return a copy of the position object with its location shifted (PRIVATE)."""
+        return self.__class__(
+            int(self) + offset, [p._shift(offset) for p in self.position_choices]
+        )
+
+    def _flip(self, length):
+        """Return a copy of the location after the parent is reversed (PRIVATE)."""
+        return self.__class__(
+            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
+        )
+
+
+class PositionGap:
+    """Simple class to hold information about a gap between positions."""
+
+    def __init__(self, gap_size):
+        """Intialize with a position object containing the gap information."""
+        self.gap_size = gap_size
+
+    def __repr__(self):
+        """Represent the position gap as a string for debugging."""
+        return "%s(%r)" % (self.__class__.__name__, self.gap_size)
+
+    def __str__(self):
+        """Return a representation of the PositionGap object (with python counting)."""
+        return "gap(%s)" % self.gap_size
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqIO/AbiIO.py b/code/lib/Bio/SeqIO/AbiIO.py
new file mode 100644
index 0000000..31ca8cc
--- /dev/null
+++ b/code/lib/Bio/SeqIO/AbiIO.py
@@ -0,0 +1,602 @@
+# Copyright 2011 by Wibowo Arindrarto (w.arindrarto@gmail.com)
+# Revisions copyright 2011-2016 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO parser for the ABI format.
+
+ABI is the format used by Applied Biosystem's sequencing machines to store
+sequencing results.
+
+For more details on the format specification, visit:
+http://www6.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf
+
+"""
+import datetime
+import struct
+import sys
+
+from os.path import basename
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+
+
+# dictionary for determining which tags go into the SeqRecord annotation
+# each key is tag_name + tag_number
+# if a tag entry needs to be added, just add its key here, with the
+# corresponding annotations dictionary key as the value
+_EXTRACT = {
+    "TUBE1": "sample_well",
+    "DySN1": "dye",
+    "GTyp1": "polymer",
+    "MODL1": "machine_model",
+}
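+# For example, the value parsed for the raw "TUBE1" tag ends up as
+# record.annotations["sample_well"] on the resulting SeqRecord.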
+
+
+# Complete data structure representing 98% of the API. The general section
+# represents the part of the API that's common to ALL instruments, whereas the
+# instrument specific sections are labelled as they are in the ABIF spec
+#
+# Keys don't seem to clash from machine to machine, so when we parse, we look
+# for ANY key, and store that in the raw ABIF data structure attached to the
+# annotations, with the assumption that anyone parsing the data can look up
+# the spec themselves
+#
+# Key definitions are retained in case end users want "nice" labels pre-made
+# for them for all of the available fields.
+_INSTRUMENT_SPECIFIC_TAGS = {}
+
+# fmt: off
+_INSTRUMENT_SPECIFIC_TAGS["general"] = {
+    "APFN2": "Sequencing Analysis parameters file name",
+    "APXV1": "Analysis Protocol XML schema version",
+    "APrN1": "Analysis Protocol settings name",
+    "APrV1": "Analysis Protocol settings version",
+    "APrX1": "Analysis Protocol XML string",
+    "CMNT1": "Sample Comment",
+    "CTID1": "Container Identifier, a.k.a. plate barcode",
+    "CTNM1": "Container name, usually identical to CTID, but not necessarily so",
+    "CTTL1": "Comment Title",
+    "CpEP1": "Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.",
+    "DATA1": "Channel 1 raw data",
+    "DATA2": "Channel 2 raw data",
+    "DATA3": "Channel 3 raw data",
+    "DATA4": "Channel 4 raw data",
+    "DATA5": "Short Array holding measured volts/10 (EP voltage) during run",
+    "DATA6": "Short Array holding measured milliAmps trace (EP current) during run",
+    "DATA7": "Short Array holding measured milliWatts trace (Laser EP Power) during run",
+    "DATA8": "Short Array holding measured oven Temperature (polymer temperature) trace during run",
+    "DATA9": "Channel 9 processed data",
+    "DATA10": "Channel 10 processed data",
+    "DATA11": "Channel 11 processed data",
+    "DATA12": "Channel 12 processed data",
+    # Prism 3100/3100-Avant may provide DATA105
+    #          3130/3130-XL may provide DATA105
+    # 3530/3530-XL may provide DATA105-199, 9-12, 205-299
+    "DSam1": "Downsampling factor",
+    "DySN1": "Dye set name",
+    "Dye#1": "Number of dyes",
+    "DyeN1": "Dye 1 name",
+    "DyeN2": "Dye 2 name",
+    "DyeN3": "Dye 3 name",
+    "DyeN4": "Dye 4 name",
+    "DyeW1": "Dye 1 wavelength",
+    "DyeW2": "Dye 2 wavelength",
+    "DyeW3": "Dye 3 wavelength",
+    "DyeW4": "Dye 4 wavelength",
+    # 'DyeN5-N': 'Dye 5-N Name',
+    # 'DyeW5-N': 'Dye 5-N Wavelength',
+    "EPVt1": "Electrophoresis voltage setting (volts)",
+    "EVNT1": "Start Run event",
+    "EVNT2": "Stop Run event",
+    "EVNT3": "Start Collection event",
+    "EVNT4": "Stop Collection event",
+    "FWO_1": 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"',
+    "GTyp1": "Gel or polymer Type",
+    "InSc1": "Injection time (seconds)",
+    "InVt1": "Injection voltage (volts)",
+    "LANE1": "Lane/Capillary",
+    "LIMS1": "Sample tracking ID",
+    "LNTD1": "Length to detector",
+    "LsrP1": "Laser Power setting (micro Watts)",
+    "MCHN1": "Instrument name and serial number",
+    "MODF1": "Data collection module file",
+    "MODL1": "Model number",
+    "NAVG1": "Pixels averaged per lane",
+    "NLNE1": "Number of capillaries",
+    "OfSc1": "List of scans that are marked off scale in Collection. (optional)",
+    # OvrI and OrvV are listed as "1-N", and "One for each dye (unanalyzed
+    # and/or analyzed data)"
+    "OvrI1": "List of scan number indexes that have values greater than 32767 but did not "
+             "saturate the camera. In Genemapper samples, this can have indexes with "
+             "values greater than 32000. In sequencing samples, this cannot have "
+             "indexes with values greater than 32000.",
+    "OvrI2": "List of scan number indexes that have values greater than 32767 but did not "
+             "saturate the camera. In Genemapper samples, this can have indexes with "
+             "values greater than 32000. In sequencing samples, this cannot have "
+             "indexes with values greater than 32000.",
+    "OvrI3": "List of scan number indexes that have values greater than 32767 but did not "
+             "saturate the camera. In Genemapper samples, this can have indexes with "
+             "values greater than 32000. In sequencing samples, this cannot have "
+             "indexes with values greater than 32000.",
+    "OvrI4": "List of scan number indexes that have values greater than 32767 but did not "
+             "saturate the camera. In Genemapper samples, this can have indexes with "
+             "values greater than 32000. In sequencing samples, this cannot have "
+             "indexes with values greater than 32000.",
+    "OvrV1": "List of color data values found at the locations listed in the OvrI tag. "
+             "There must be exactly as many numbers in this array as in the OvrI array.",
+    "OvrV2": "List of color data values found at the locations listed in the OvrI tag. "
+             "There must be exactly as many numbers in this array as in the OvrI array.",
+    "OvrV3": "List of color data values found at the locations listed in the OvrI tag. "
+             "There must be exactly as many numbers in this array as in the OvrI array.",
+    "OvrV4": "List of color data values found at the locations listed in the OvrI tag. "
+             "There must be exactly as many numbers in this array as in the OvrI array.",
+    "PDMF1": "Sequencing Analysis Mobility file name chosen in collection",
+    "RMXV1": "Run Module XML schema version",
+    "RMdN1": "Run Module name (same as MODF)",
+    "RMdX1": "Run Module XML string",
+    "RPrN1": "Run Protocol name",
+    "RPrV1": "Run Protocol version",
+    "RUND1": "Run Started Date",
+    "RUND2": "Run Stopped Date",
+    "RUND3": "Data Collection Started Date",
+    "RUND4": "Data Collection Stopped date",
+    "RUNT1": "Run Started Time",
+    "RUNT2": "Run Stopped Time",
+    "RUNT3": "Data Collection Started Time",
+    "RUNT4": "Data Collection Stopped Time",
+    "Rate1": "Scanning Rate. Milliseconds per frame.",
+    "RunN1": "Run Name",
+    "SCAN1": "Number of scans",
+    "SMED1": "Polymer lot expiration date",
+    "SMLt1": "Polymer lot number",
+    "SMPL1": "Sample name",
+    "SVER1": "Data collection software version",
+    "SVER3": "Data collection firmware version",
+    "Satd1": "Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)",
+    "Scal1": "Rescaling divisor for color data",
+    "Scan1": "Number of scans (legacy - use SCAN)",
+    "TUBE1": "Well ID",
+    "Tmpr1": "Run temperature setting",
+    "User1": "Name of user who created the plate (optional)",
+}
+
+#  No instrument specific tags
+# _INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = {
+# }
+
+_INSTRUMENT_SPECIFIC_TAGS["abi_3130/3130xl"] = {
+    "CTOw1": "Container owner",
+    "HCFG1": "Instrument Class",
+    "HCFG2": "Instrument Family",
+    "HCFG3": "Official Instrument Name",
+    "HCFG4": "Instrument Parameters",
+    "RMdVa1": "Run Module version",
+}
+
+_INSTRUMENT_SPECIFIC_TAGS["abi_3530/3530xl"] = {
+    "AAct1": "Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, "
+             "false if system auditing was disabled.",
+    "ABED1": "Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.",
+    "ABID1": "Anode buffer tray first installed date",
+    "ABLt1": "Anode buffer lot number",
+    "ABRn1": "Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)",
+    "ABTp1": "Anode buffer type",
+    "AEPt1": "Analysis Ending scan number for basecalling on initial analysis",
+    "AEPt2": "Analysis Ending scan number for basecalling on last analysis",
+    "APCN1": "Amplicon name",
+    "ARTN1": "Analysis Return code. Produced only by 5 Prime basecaller 1.0b3",
+    "ASPF1": "Flag to indicate whether adaptive processing worked or not",
+    "ASPt1": "Analysis Starting scan number for first analysis",
+    "ASPt2": "Analysis Starting scan number for last analysis",
+    "AUDT2": "Audit log used across 3500 software (optional)",
+    "AVld1": "Assay validation flag (true or false)",
+    "AmbT1": "Record of ambient temperature readings",
+    "AsyC1": "The assay contents (xml format)",
+    "AsyN1": "The assay name",
+    "AsyV1": "The assay version",
+    "B1Pt1": "Reference scan number for mobility and spacing curves for first analysis",
+    "B1Pt2": "Reference scan number for mobility and spacing curves for last analysis",
+    "BCTS1": "Basecaller timestamp. Time of completion of most recent analysis",
+    "BcRn1": "Basecalling qc code",
+    "BcRs1": "Basecalling warnings, a concatenated comma separated string",
+    "BcRs2": "Basecalling errors, a concatenated comma separated string",
+    "CAED1": "Capillary array expiration",
+    "CALt1": "Capillary array lot number",
+    "CARn1": "Number of injections processed (including the one of which this sample was a part) through the capillary array",
+    "CASN1": "Capillary array serial number",
+    "CBED1": "Cathode buffer expiration date",
+    "CBID1": "Cathode buffer tray first installed date",
+    "CBLt1": "Cathode buffer lot number",
+    "CBRn1": "Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)",
+    "CBTp1": "Cathode buffer type",
+    "CLRG1": "Start of the clear range (inclusive).",
+    "CLRG2": "Clear range length",
+    "CRLn1": "Contiguous read length",
+    "CRLn2": 'One of "Pass", "Fail", or "Check"',
+    "CTOw1": "The name entered as the Owner of a plate, in the plate editor",
+    "CkSm1": "File checksum",
+    "DCEv1": "A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.",
+    "DCHT1": "Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.",
+    "DOEv1": "A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.",
+    "ESig2": "Electronic signature record used across 3500 software",
+    "FTab1": "Feature table. Can be created by Nibbler for Clear Range.",
+    "FVoc1": "Feature table vocabulary. Can be created by Nibbler for Clear Range.",
+    "Feat1": "Features. Can be created by Nibbler for Clear Range.",
+    "HCFG1": "The Instrument Class. All upper case, no spaces. Initial valid value: CE",
+    "HCFG2": "The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)",
+    "HCFG3": "The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.",
+    "HCFG4": "Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. "
+             "Four parameters are included initially: UnitID=, CPUBoard=, "
+             "ArraySize=<# of capillaries>, SerialNumber=.",
+    "InjN1": "Injection name",
+    "LAST1": "Parameter settings information",
+    "NOIS1": "The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. "
+             "Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.",
+    "P1AM1": "Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position",
+    "P1RL1": "Deviation of primary peak position from (PLoc,2), times 100, rounded to integer",
+    "P1WD1": "Full-width Half-max of primary peak, times 100, rounded to integer. "
+             "Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude",
+    "P2AM1": "Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position",
+    "P2BA1": "Base of secondary peak",
+    "P2RL1": "Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer",
+    "PBAS1": "Array of sequence characters edited by user",
+    "PBAS2": "Array of sequence characters as called by Basecaller",
+    "PCON1": "Array of quality Values (0-255) as edited by user",
+    "PCON2": "Array of quality values (0-255) as called by Basecaller",
+    "PDMF2": "Mobility file name chosen in most recent analysis (identical to PDMF1)",
+    "PLOC1": "Array of peak locations edited by user",
+    "PLOC2": "Array of peak locations as called by Basecaller",
+    "PRJT1": "SeqScape 2.0 project template name",
+    "PROJ4": "SeqScape 2.0 project name",
+    "PSZE1": "Plate size. The number of sample positions in the container. Current allowed values: 96, 384.",
+    "PTYP1": "Plate type. Current allowed values: 96-Well, 384-Well.",
+    "PuSc1": "Median pupscore",
+    "QV201": "QV20+ value",
+    "QV202": 'One of "Pass", "Fail", or "Check"',
+    "QcPa1": "QC parameters",
+    "QcRn1": "Trimming and QC code",
+    "QcRs1": "QC warnings, a concatenated comma separated string",
+    "QcRs2": "QC errors, a concatenated comma separated string",
+    "RGOw1": "The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.",
+    "RInj1": "Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.",
+    "RNmF1": "Raman normalization factor",
+    "RevC1": "for whether the sequence has been complemented",
+    "RunN1": "Run name (which, for 3500, is different from injection name)",
+    "S/N%1": "Signal strength for each dye",
+    "SMID1": "Polymer first installed date",
+    "SMRn1": "Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)",
+    "SPAC1": "Average peak spacing used in last analysis",
+    "SPAC2": "Basecaller name - corresponds to name of bcp file.",
+    "SPAC3": "Average peak spacing last calculated by the Basecaller.",
+    "SPEC1": "Sequencing Analysis Specimen Name",
+    "SVER2": "Basecaller version number",
+    "SVER4": "Sample File Format Version String",
+    "ScPa1": "The parameter string of size caller",
+    "ScSt1": "Raw data start point. Set to 0 for 3500 data collection.",
+    "SpeN1": "Active spectral calibration name",
+    "TrPa1": "Timming parameters",
+    "TrSc1": "Trace score.",
+    "TrSc2": 'One of "Pass", "Fail", or "Check"',
+    "phAR1": "Trace peak aria ratio",
+    "phCH1": 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information',
+    "phDY1": 'Dye ("big", "d-rhod", "unknown"), based on mob file information',
+    "phQL1": "Maximum Quality Value",
+    "phTR1": "Set Trim region",
+    "phTR2": "Trim probability",
+}
+
+_INSTRUMENT_SPECIFIC_TAGS["abi_3730/3730xl"] = {
+    "BufT1": "Buffer tray heater temperature (degrees C)",
+}
+# fmt: on
+
+# dictionary for data unpacking format
+_BYTEFMT = {
+    1: "b",  # byte
+    2: "s",  # char
+    3: "H",  # word
+    4: "h",  # short
+    5: "i",  # long
+    6: "2i",  # rational, legacy unsupported
+    7: "f",  # float
+    8: "d",  # double
+    10: "h2B",  # date
+    11: "4B",  # time
+    12: "2i2b",  # thumb
+    13: "B",  # bool
+    14: "2h",  # point, legacy unsupported
+    15: "4h",  # rect, legacy unsupported
+    16: "2i",  # vPoint, legacy unsupported
+    17: "4i",  # vRect, legacy unsupported
+    18: "s",  # pString
+    19: "s",  # cString
+    20: "2i",  # tag, legacy unsupported
+}
+# header data structure (excluding 4 byte ABIF marker)
+_HEADFMT = ">H4sI2H3I"
+# directory data structure
+_DIRFMT = ">4sI2H4I"
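+# For reference, each 28-byte directory entry unpacked with _DIRFMT is the
+# 8-tuple (tag name, tag number, element type code, element size, element
+# count, data size, data offset, data handle); _abi_parse_header appends the
+# entry's own file offset as a ninth item.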
+
+__global_tag_listing = []
+for tag in _INSTRUMENT_SPECIFIC_TAGS.values():
+    __global_tag_listing += tag.keys()
+
+
+def _get_string_tag(opt_bytes_value, default=None):
+    """Return the string value of the given an optional raw bytes tag value.
+
+    If the bytes value is None, return the given default value.
+
+    """
+    if opt_bytes_value is None:
+        return default
+    try:
+        return opt_bytes_value.decode()
+    except UnicodeDecodeError:
+        return opt_bytes_value.decode(encoding=sys.getdefaultencoding())
+
+
+class AbiIterator(SequenceIterator):
+    """Parser for Abi files."""
+
+    def __init__(self, source, trim=False):
+        """Return an iterator for the Abi file format."""
+        self.trim = trim
+        super().__init__(source, mode="b", fmt="ABI")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        # check if input file is a valid Abi file
+        marker = handle.read(4)
+        if not marker:
+            # handle empty file gracefully
+            raise ValueError("Empty file.")
+
+        if marker != b"ABIF":
+            raise OSError("File should start ABIF, not %r" % marker)
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        # dirty hack for handling time information
+        times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}
+
+        # initialize annotations
+        annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))
+
+        # parse header and extract data from directories
+        header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))
+
+        # Set default sample ID value, which we expect to be present in most
+        # cases in the SMPL1 tag, but may be missing.
+        sample_id = ""
+
+        raw = {}
+        seq = qual = None
+        for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
+            key = tag_name + str(tag_number)
+
+            raw[key] = tag_data
+
+            # PBAS2 is base-called sequence, only available in 3530
+            if key == "PBAS2":
+                seq = tag_data.decode()
+            # PCON2 is quality values of base-called sequence
+            elif key == "PCON2":
+                qual = [ord(val) for val in tag_data.decode()]
+            # SMPL1 is the sample id entered before the sequencing run; it
+            # must be a string.
+            elif key == "SMPL1":
+                sample_id = _get_string_tag(tag_data)
+            elif key in times:
+                times[key] = tag_data
+            else:
+                if key in _EXTRACT:
+                    annot[_EXTRACT[key]] = tag_data
+
+        # set time annotations
+        annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
+        annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])
+
+        # raw data (for advanced end users benefit)
+        annot["abif_raw"] = raw
+
+        # fsa check
+        is_fsa_file = all(tn not in raw for tn in ("PBAS1", "PBAS2"))
+
+        if is_fsa_file:
+            try:
+                file_name = basename(handle.name).replace(".fsa", "")
+            except AttributeError:
+                file_name = ""
+
+            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
+            description = _get_string_tag(raw.get("CTID1"), "")
+            record = SeqRecord(
+                Seq(""),
+                id=sample_id,
+                name=file_name,
+                description=description,
+                annotations=annot,
+            )
+
+        else:
+            # use the file name as SeqRecord.name if available
+            try:
+                file_name = basename(handle.name).replace(".ab1", "")
+            except AttributeError:
+                file_name = ""
+            record = SeqRecord(
+                Seq(seq),
+                id=sample_id,
+                name=file_name,
+                description="",
+                annotations=annot,
+            )
+        if qual:
+            # Expect this to be missing for FSA files.
+            record.letter_annotations["phred_quality"] = qual
+        elif not is_fsa_file and not qual and self.trim:
+            raise ValueError(
+                "The 'abi-trim' format can not be used for files without"
+                " quality values."
+            )
+
+        if self.trim and not is_fsa_file:
+            record = _abi_trim(record)
+
+        record.annotations["molecule_type"] = "DNA"
+        yield record
+
+
+def _AbiTrimIterator(handle):
+    """Return an iterator for the Abi file format that yields trimmed SeqRecord objects (PRIVATE)."""
+    return AbiIterator(handle, trim=True)
+
+
+def _abi_parse_header(header, handle):
+    """Return directory contents (PRIVATE)."""
+        # header structure (after ABIF marker), as unpacked with _HEADFMT:
+        # file version, tag name, tag number, element type code,
+        # element size, number of elements, data size, data offset
+    head_elem_size = header[4]
+    head_elem_num = header[5]
+    head_offset = header[7]
+    index = 0
+
+    while index < head_elem_num:
+        start = head_offset + index * head_elem_size
+        # add directory offset to tuple
+        # to handle directories with data size <= 4 bytes
+        handle.seek(start)
+        dir_entry = struct.unpack(_DIRFMT, handle.read(struct.calcsize(_DIRFMT))) + (
+            start,
+        )
+        index += 1
+
+        tag_name = dir_entry[0].decode()
+        tag_number = dir_entry[1]
+        elem_code = dir_entry[2]
+        elem_num = dir_entry[4]
+        data_size = dir_entry[5]
+        data_offset = dir_entry[6]
+        tag_offset = dir_entry[8]
+        # if data size <= 4 bytes, data is stored inside tag
+        # so offset needs to be changed
+        if data_size <= 4:
+            data_offset = tag_offset + 20
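+            # (the inline value is stored in the 4-byte data offset field
+            # itself, which starts 20 bytes into the 28-byte entry)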
+        handle.seek(data_offset)
+        data = handle.read(data_size)
+        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
+
+
+def _abi_trim(seq_record):
+    """Trims the sequence using Richard Mott's modified trimming algorithm (PRIVATE).
+
+    Arguments:
+        - seq_record - SeqRecord object to be trimmed.
+
+    Trimmed bases are determined from their segment score, which is a
+    cumulative sum of each base's score. Base scores are calculated from
+    their quality values.
+
+    More about the trimming algorithm:
+    http://www.phrap.org/phredphrap/phred.html
+    http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Quality_trimming.html
+    """
+    start = False  # flag for starting position of trimmed sequence
+    segment = 20  # minimum sequence length
+    trim_start = 0  # init start index
+    cutoff = 0.05  # default cutoff value for calculating base score
+
+    if len(seq_record) <= segment:
+        return seq_record
+    else:
+        # calculate base score
+        score_list = [
+            cutoff - (10 ** (qual / -10.0))
+            for qual in seq_record.letter_annotations["phred_quality"]
+        ]
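+        # e.g. a base of phred quality 20 has error probability
+        # 10 ** (20 / -10.0) = 0.01, scoring 0.05 - 0.01 = 0.04; bases
+        # below quality ~13 (error rate > 0.05) score negative.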
+
+        # calculate cumulative score
+        # if cumulative value < 0, set it to 0
+        # first value is set to 0, because of the assumption that
+        # the first base will always be trimmed out
+        cummul_score = [0]
+        for i in range(1, len(score_list)):
+            score = cummul_score[-1] + score_list[i]
+            if score < 0:
+                cummul_score.append(0)
+            else:
+                cummul_score.append(score)
+                if not start:
+                    # trim_start = value when cumulative score is first > 0
+                    trim_start = i
+                    start = True
+
+        # trim_finish = index of highest cumulative score,
+        # marking the end of sequence segment with highest cumulative score
+        trim_finish = cummul_score.index(max(cummul_score))
+
+        return seq_record[trim_start:trim_finish]
+
+
+def _parse_tag_data(elem_code, elem_num, raw_data):
+    """Return single data value (PRIVATE).
+
+    Arguments:
+     - elem_code - What kind of data
+     - elem_num - How many data points
+     - raw_data - the raw bytes from which the tag value is unpacked
+
+    """
+    if elem_code in _BYTEFMT:
+        # because '>1s' unpack differently from '>s'
+        if elem_num == 1:
+            num = ""
+        else:
+            num = str(elem_num)
+        fmt = ">" + num + _BYTEFMT[elem_code]
+
+        assert len(raw_data) == struct.calcsize(fmt)
+        data = struct.unpack(fmt, raw_data)
+
+        # no need to use tuple if len(data) == 1
+        # also if data is date / time
+        if elem_code not in [10, 11] and len(data) == 1:
+            data = data[0]
+
+        # account for different data types
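+        # (2 = char array returned as-is, 10 = date, 11 = time, 13 = bool,
+        #  18 = pString with its leading length byte stripped,
+        #  19 = cString with its trailing null byte stripped)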
+        if elem_code == 2:
+            return data
+        elif elem_code == 10:
+            return str(datetime.date(*data))
+        elif elem_code == 11:
+            return str(datetime.time(*data[:3]))
+        elif elem_code == 13:
+            return bool(data)
+        elif elem_code == 18:
+            return data[1:]
+        elif elem_code == 19:
+            return data[:-1]
+        else:
+            return data
+    else:
+        return None
+
+
+if __name__ == "__main__":
+    pass
diff --git a/code/lib/Bio/SeqIO/AceIO.py b/code/lib/Bio/SeqIO/AceIO.py
new file mode 100644
index 0000000..85bb0f5
--- /dev/null
+++ b/code/lib/Bio/SeqIO/AceIO.py
@@ -0,0 +1,101 @@
+# Copyright 2008-2015 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "ace" file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.Sequencing.Ace module which offers more than just accessing
+the contig consensus sequences in an ACE file as SeqRecord objects.
+"""
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Sequencing import Ace
+
+
+def AceIterator(source):
+    """Return SeqRecord objects from an ACE file.
+
+    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
+    by iterating over the file in a single pass, we are forced to ignore any
+    WA, CT, RT or WR footer tags.
+
+    Ace files include the base quality for each position, which are taken
+    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
+    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
+    letter_annotations dictionary under the "phred_quality" key.
+
+    >>> from Bio import SeqIO
+    >>> with open("Ace/consed_sample.ace") as handle:
+    ...     for record in SeqIO.parse(handle, "ace"):
+    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
+    ...         print(max(record.letter_annotations["phred_quality"]))
+    Contig1 agccccgggc... 1475
+    90
+
+    However, ACE files do not include a base quality for any gaps in the
+    consensus sequence, and these are represented in Biopython with a quality
+    of zero. Using zero is perhaps misleading as there may be very strong
+    evidence to support the gap in the consensus. Previous versions of
+    Biopython therefore used None instead, but this complicated usage, and
+    prevented output of the gapped sequence as FASTQ format.
+
+    >>> from Bio import SeqIO
+    >>> with open("Ace/contig1.ace") as handle:
+    ...     for record in SeqIO.parse(handle, "ace"):
+    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
+    ...         print(record.letter_annotations["phred_quality"][85:95])
+    ...         print(max(record.letter_annotations["phred_quality"]))
+    Contig1 ...AGAGG-ATGC...
+    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
+    90
+    Contig2 ...GAATTACTAT...
+    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
+    90
+
+    """
+    for ace_contig in Ace.parse(source):
+        # Convert the ACE contig record into a SeqRecord...
+        consensus_seq_str = ace_contig.sequence
+        if "*" in consensus_seq_str:
+            # For consistency with most other file formats, map
+            # any * gaps into - gaps.
+            assert "-" not in consensus_seq_str
+            consensus_seq = Seq(consensus_seq_str.replace("*", "-"))
+        else:
+            consensus_seq = Seq(consensus_seq_str)
+
+        # TODO? - Base segments (BS lines) which indicates which read
+        # phrap has chosen to be the consensus at a particular position.
+        # Perhaps as SeqFeature objects?
+
+        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
+        # Perhaps as SeqFeature objects?
+
+        seq_record = SeqRecord(consensus_seq, id=ace_contig.name, name=ace_contig.name)
+
+        # Consensus base quality (BQ lines).  Note that any gaps (originally
+        # as * characters) in the consensus do not get a quality entry, so
+        # we assign a quality of zero (older versions used None here, but
+        # that complicated downstream use - see the docstring above).
+        quals = []
+        i = 0
+        for base in consensus_seq:
+            if base == "-":
+                quals.append(0)
+            else:
+                quals.append(ace_contig.quality[i])
+                i += 1
+        assert i == len(ace_contig.quality)
+        seq_record.letter_annotations["phred_quality"] = quals
+
+        yield seq_record
+    # All done
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqIO/FastaIO.py b/code/lib/Bio/SeqIO/FastaIO.py
new file mode 100644
index 0000000..73abcd4
--- /dev/null
+++ b/code/lib/Bio/SeqIO/FastaIO.py
@@ -0,0 +1,426 @@
+# Copyright 2006-2017,2020 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# This module is for reading and writing FASTA format files as SeqRecord
+# objects.  The code is partly inspired by earlier Biopython modules,
+# Bio.Fasta.* and the now removed module Bio.SeqIO.FASTA
+"""Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+"""
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import _clean
+from .Interfaces import _get_seq_string
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+def SimpleFastaParser(handle):
+    """Iterate over Fasta records as string tuples.
+
+    Arguments:
+     - handle - input stream opened in text mode
+
+    For each record a tuple of two strings is returned, the FASTA title
+    line (without the leading '>' character), and the sequence (with any
+    whitespace removed). The title line is not divided up into an
+    identifier (the first word) and comment or description.
+
+    >>> with open("Fasta/dups.fasta") as handle:
+    ...     for values in SimpleFastaParser(handle):
+    ...         print(values)
+    ...
+    ('alpha', 'ACGTA')
+    ('beta', 'CGTC')
+    ('gamma', 'CCGCC')
+    ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
+    ('delta', 'CGCGC')
+
+    """
+    # Skip any text before the first record (e.g. blank lines, comments)
+    for line in handle:
+        if line[0] == ">":
+            title = line[1:].rstrip()
+            break
+    else:
+        # no break encountered - probably an empty file
+        return
+
+    # Main logic
+    # Note, remove trailing whitespace, and any internal spaces
+    # (and any embedded \r which are possible in mangled files
+    # when not opened in universal read lines mode)
+    lines = []
+    for line in handle:
+        if line[0] == ">":
+            yield title, "".join(lines).replace(" ", "").replace("\r", "")
+            lines = []
+            title = line[1:].rstrip()
+            continue
+        lines.append(line.rstrip())
+
+    yield title, "".join(lines).replace(" ", "").replace("\r", "")
+
+
+def FastaTwoLineParser(handle):
+    """Iterate over no-wrapping Fasta records as string tuples.
+
+    Arguments:
+     - handle - input stream opened in text mode
+
+    Functionally the same as SimpleFastaParser but with a strict
+    interpretation of the FASTA format as exactly two lines per
+    record, the greater-than-sign identifier with description,
+    and the sequence with no line wrapping.
+
+    Any line wrapping will raise an exception, as will excess blank
+    lines (other than the special case of a zero-length sequence
+    as the second line of a record).
+
+    Examples
+    --------
+    This file uses two lines per FASTA record:
+
+    >>> with open("Fasta/aster_no_wrap.pro") as handle:
+    ...     for title, seq in FastaTwoLineParser(handle):
+    ...         print("%s = %s..." % (title, seq[:3]))
+    ...
+    gi|3298468|dbj|BAA31520.1| SAMIPF = GGH...
+
+    This equivalent file uses line wrapping:
+
+    >>> with open("Fasta/aster.pro") as handle:
+    ...     for title, seq in FastaTwoLineParser(handle):
+    ...         print("%s = %s..." % (title, seq[:3]))
+    ...
+    Traceback (most recent call last):
+       ...
+    ValueError: Expected FASTA record starting with '>' character. Perhaps this file is using FASTA line wrapping? Got: 'MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI'
+
+    """
+    idx = -1  # for empty file
+    for idx, line in enumerate(handle):
+        if idx % 2 == 0:  # title line
+            if line[0] != ">":
+                raise ValueError(
+                    "Expected FASTA record starting with '>' character. "
+                    "Perhaps this file is using FASTA line wrapping? "
+                    f"Got: '{line}'"
+                )
+            title = line[1:].rstrip()
+        else:  # sequence line
+            if line[0] == ">":
+                raise ValueError(
+                    "Two '>' FASTA lines in a row. Missing sequence line "
+                    "if this is strict two-line-per-record FASTA format. "
+                    f"Have '>{title}' and '{line}'"
+                )
+            yield title, line.strip()
+
+    if idx == -1:
+        pass  # empty file
+    elif idx % 2 == 0:  # on a title line
+        raise ValueError(
+            "Missing sequence line at end of file if this is strict "
+            f"two-line-per-record FASTA format. Have title line '{line}'"
+        )
+    else:
+        assert line[0] != ">", "line[0] == '>' ; this should be impossible!"
+
+
+class FastaIterator(SequenceIterator):
+    """Parser for Fasta files."""
+
+    def __init__(self, source, alphabet=None, title2ids=None):
+        """Iterate over Fasta records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, not used. Leave as None.
+         - title2ids - A function that, when given the title of the FASTA
+           file (without the beginning >), will return the id, name and
+           description (in that order) for the record as a tuple of strings.
+           If this is not given, then the entire title line will be used
+           as the description, and the first word as the id and name.
+
+        By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
+        with no custom handling of the title lines:
+
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle):
+        ...         print(record.id)
+        ...
+        alpha
+        beta
+        gamma
+        alpha
+        delta
+
+        However, you can supply a title2ids function to alter this:
+
+        >>> def take_upper(title):
+        ...     return title.split(None, 1)[0].upper(), "", title
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle, title2ids=take_upper):
+        ...         print(record.id)
+        ...
+        ALPHA
+        BETA
+        GAMMA
+        ALPHA
+        DELTA
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        self.title2ids = title2ids
+        super().__init__(source, mode="t", fmt="Fasta")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        title2ids = self.title2ids
+        if title2ids:
+            for title, sequence in SimpleFastaParser(handle):
+                id, name, descr = title2ids(title)
+                yield SeqRecord(Seq(sequence), id=id, name=name, description=descr)
+        else:
+            for title, sequence in SimpleFastaParser(handle):
+                try:
+                    first_word = title.split(None, 1)[0]
+                except IndexError:
+                    assert not title, repr(title)
+                    # Should we use SeqRecord default for no ID?
+                    first_word = ""
+                yield SeqRecord(
+                    Seq(sequence), id=first_word, name=first_word, description=title,
+                )
+
+
+class FastaTwoLineIterator(SequenceIterator):
+    """Parser for Fasta files with exactly two lines per record."""
+
+    def __init__(self, source):
+        """Iterate over two-line Fasta records (as SeqRecord objects).
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+
+        This uses a strict interpretation of the FASTA as requiring
+        exactly two lines per record (no line wrapping).
+
+        Only the default title to ID/name/description parsing offered
+        by the relaxed FASTA parser is offered.
+        """
+        super().__init__(source, mode="t", fmt="FASTA")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for title, sequence in FastaTwoLineParser(handle):
+            try:
+                first_word = title.split(None, 1)[0]
+            except IndexError:
+                assert not title, repr(title)
+                # Should we use SeqRecord default for no ID?
+                first_word = ""
+            yield SeqRecord(
+                Seq(sequence), id=first_word, name=first_word, description=title,
+            )
+
+
+class FastaWriter(SequenceWriter):
+    """Class to write Fasta format files (OBSOLETE).
+
+    Please use the ``as_fasta`` function instead, or the top level
+    ``Bio.SeqIO.write()`` function using ``format="fasta"``.
+    """
+
+    def __init__(self, target, wrap=60, record2title=None):
+        """Create a Fasta writer (OBSOLETE).
+
+        Arguments:
+         - target - Output stream opened in text mode, or a path to a file.
+         - wrap -   Optional line length used to wrap sequence lines.
+           Defaults to wrapping the sequence at 60 characters.
+           Use zero (or None) for no wrapping, giving a single
+           long line for the sequence.
+         - record2title - Optional function to return the text to be
+           used for the title line of each record.  By default
+           a combination of the record.id and record.description
+           is used.  If the record.description starts with the
+           record.id, then just the record.description is used.
+
+        You can either use::
+
+            handle = open(filename, "w")
+            writer = FastaWriter(handle)
+            writer.write_file(myRecords)
+            handle.close()
+
+        Or, follow the sequential file writer system, for example::
+
+            handle = open(filename, "w")
+            writer = FastaWriter(handle)
+            writer.write_header() # does nothing for Fasta files
+            ...
+            Multiple writer.write_record() and/or writer.write_records() calls
+            ...
+            writer.write_footer() # does nothing for Fasta files
+            handle.close()
+
+        """
+        super().__init__(target)
+        if wrap:
+            if wrap < 1:
+                raise ValueError("wrap should be None, zero, or a positive integer")
+        self.wrap = wrap
+        self.record2title = record2title
+
+    def write_record(self, record):
+        """Write a single Fasta record to the file."""
+        if self.record2title:
+            title = self.clean(self.record2title(record))
+        else:
+            id = self.clean(record.id)
+            description = self.clean(record.description)
+            if description and description.split(None, 1)[0] == id:
+                # The description includes the id at the start
+                title = description
+            elif description:
+                title = "%s %s" % (id, description)
+            else:
+                title = id
+
+        assert "\n" not in title
+        assert "\r" not in title
+        self.handle.write(">%s\n" % title)
+
+        data = _get_seq_string(record)  # Catches sequence being None
+
+        assert "\n" not in data
+        assert "\r" not in data
+
+        if self.wrap:
+            for i in range(0, len(data), self.wrap):
+                self.handle.write(data[i : i + self.wrap] + "\n")
+        else:
+            self.handle.write(data + "\n")
+
+
+class FastaTwoLineWriter(FastaWriter):
+    """Class to write 2-line per record Fasta format files (OBSOLETE).
+
+    This means we write the sequence information without line
+    wrapping, and will always write a blank line for an empty
+    sequence.
+
+    Please use the ``as_fasta_2line`` function instead, or the top level
+    ``Bio.SeqIO.write()`` function using ``format="fasta-2line"``.
+    """
+
+    def __init__(self, handle, record2title=None):
+        """Create a 2-line per record Fasta writer (OBSOLETE).
+
+        Arguments:
+         - handle - Handle to an output file, e.g. as returned
+           by open(filename, "w")
+         - record2title - Optional function to return the text to be
+           used for the title line of each record.  By default
+           a combination of the record.id and record.description
+           is used.  If the record.description starts with the
+           record.id, then just the record.description is used.
+
+        You can either use::
+
+            handle = open(filename, "w")
+            writer = FastaTwoLineWriter(handle)
+            writer.write_file(myRecords)
+            handle.close()
+
+        Or, follow the sequential file writer system, for example::
+
+            handle = open(filename, "w")
+            writer = FastaTwoLineWriter(handle)
+            writer.write_header() # does nothing for Fasta files
+            ...
+            Multiple writer.write_record() and/or writer.write_records() calls
+            ...
+            writer.write_footer() # does nothing for Fasta files
+            handle.close()
+
+        """
+        super().__init__(handle, wrap=None, record2title=record2title)
+
+
+def as_fasta(record):
+    """Turn a SeqRecord into a FASTA formatted string.
+
+    This is used internally by the SeqRecord's .format("fasta")
+    method and by the SeqIO.write(..., ..., "fasta") function.
+    """
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        # The description includes the id at the start
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    assert "\n" not in title
+    assert "\r" not in title
+    lines = [">%s\n" % title]
+
+    data = _get_seq_string(record)  # Catches sequence being None
+    assert "\n" not in data
+    assert "\r" not in data
+    for i in range(0, len(data), 60):
+        lines.append(data[i : i + 60] + "\n")
+
+    return "".join(lines)
+
+
+def as_fasta_2line(record):
+    """Turn a SeqRecord into a two-line FASTA formatted string.
+
+    This is used internally by the SeqRecord's .format("fasta-2line")
+    method and by the SeqIO.write(..., ..., "fasta-2line") function.
+    """
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        # The description includes the id at the start
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    assert "\n" not in title
+    assert "\r" not in title
+
+    data = _get_seq_string(record)  # Catches sequence being None
+    assert "\n" not in data
+    assert "\r" not in data
+
+    return ">%s\n%s\n" % (title, data)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/GckIO.py b/code/lib/Bio/SeqIO/GckIO.py
new file mode 100644
index 0000000..2fa48eb
--- /dev/null
+++ b/code/lib/Bio/SeqIO/GckIO.py
@@ -0,0 +1,230 @@
+# Copyright 2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "gck" file format.
+
+The GCK binary format is generated by the Gene Construction Kit software
+from Textco BioSoftware, Inc.
+"""
+from struct import unpack
+
+from Bio.Seq import Seq
+from Bio.SeqFeature import FeatureLocation
+from Bio.SeqFeature import SeqFeature
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+
+
+def _read(handle, length):
+    """Read the specified number of bytes from the given handle."""
+    data = handle.read(length)
+    if len(data) < length:
+        raise ValueError(f"Cannot read {length} bytes from handle")
+    return data
+
+
+def _read_packet(handle):
+    """Read a length-prefixed packet.
+
+    Parts of a GCK file are made of "packets", each comprising 4 bytes
+    giving the packet's size, followed by the packet's data.
+
+    There is no type tag. The type of a packet, and thus the type of data
+    it contains, is solely indicated by the position of the packet within
+    the GCK file.
+    """
+    length = _read(handle, 4)
+    length = unpack(">I", length)[0]
+    data = _read(handle, length)
+    return (data, length)
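+
+# Illustrative sketch (not part of the original module), assuming an
+# in-memory handle: the length prefix is a big-endian 32-bit integer, so a
+# four-byte payload is stored as b"\x00\x00\x00\x04" followed by the data:
+#
+#     import io
+#     data, length = _read_packet(io.BytesIO(b"\x00\x00\x00\x04ACGT"))
+#     assert (data, length) == (b"ACGT", 4)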
+
+
+def _read_pstring(handle):
+    """Read a Pascal string.
+
+    A Pascal string is one byte for length followed by the actual string.
+    """
+    length = _read(handle, 1)
+    length = unpack(">B", length)[0]
+    data = _read(handle, length).decode("ASCII")
+    return data
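+
+# Illustrative sketch: a Pascal string carries its length in one leading
+# byte, so b"\x05hello" decodes to "hello":
+#
+#     import io
+#     assert _read_pstring(io.BytesIO(b"\x05hello")) == "hello"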
+
+
+def _read_p4string(handle):
+    """Read a 32-bit Pascal string.
+
+    Similar to a Pascal string but length is encoded on 4 bytes.
+    """
+    length = _read(handle, 4)
+    length = unpack(">I", length)[0]
+    data = _read(handle, length).decode("ASCII")
+    return data
+
+
+def _parse(handle):
+    # Skip file header
+    # GCK files start with a 24-byte header. Bytes 4 and 8 seem to
+    # always be 12, maybe this could act as a magic cookie. Bytes
+    # 17-20 and 21-24 contain variable values of unknown meaning.
+    # check if file is empty
+    data = handle.read(24)
+    if not data:
+        raise ValueError("Empty file.")
+    if len(data) < 24:
+        raise ValueError("Improper header, cannot read 24 bytes from handle")
+    # Read the actual sequence data
+    packet, length = _read_packet(handle)
+    # The body of the sequence packet starts with a 32-bit integer
+    # representing the length of the sequence.
+    seq_length = unpack(">I", packet[:4])[0]
+    # This length should not be larger than the length of the
+    # sequence packet.
+    if seq_length > length - 4:
+        raise ValueError("Conflicting sequence length values")
+    sequence = packet[4:].decode("ASCII")
+    record = SeqRecord(Seq(sequence))
+
+    # Skip unknown packet
+    _read_packet(handle)
+
+    # Read features packet
+    packet, length = _read_packet(handle)
+    (seq_length, num_features) = unpack(">IH", packet[:6])
+    # Check that length in the features packet matches the actual
+    # length of the sequence
+    if seq_length != len(sequence):
+        raise ValueError("Conflicting sequence length values")
+    # Each feature is stored in a 92-byte structure.
+    if length - 6 != num_features * 92:
+        raise ValueError("Features packet size inconsistent with number of features")
+    for i in range(0, num_features):
+        offset = 6 + i * 92
+        feature_data = packet[offset : offset + 92]
+
+        # There's probably more stuff to unpack in that structure,
+        # but those values are the only ones I understand.
+        (start, end, type, strand, has_name, has_comment, version) = unpack(
+            ">II6xH14xB17xII35xB", feature_data
+        )
+
+        if strand == 1:  # Reverse strand
+            strand = -1
+        else:
+            # Other possible values are 0 (no strand specified),
+            # 2 (forward strand), and 3 (both strands). All are
+            # treated as a forward strand.
+            strand = 1
+        location = FeatureLocation(start, end, strand=strand)
+
+        # It looks like any value > 0 indicates a CDS...
+        if type > 0:
+            type = "CDS"
+        else:
+            type = "misc_feature"
+
+        # Each feature may have a name and a comment, which are then
+        # stored immediately after the features packet. Names are
+        # stored as Pascal strings (1 length byte followed by the
+        # string itself), comments are stored as "32-bit Pascal strings"
+        # (4 length bytes followed by the string).
+        qualifiers = {}
+        if has_name > 0:
+            name = _read_pstring(handle)
+            qualifiers["label"] = [name]
+        if has_comment > 0:
+            comment = _read_p4string(handle)
+            qualifiers["note"] = [comment]
+
+        # Each feature may exist in several "versions". We keep only
+        # the most recent version.
+        if version > 0:
+            continue
+
+        feature = SeqFeature(location, type=type, qualifiers=qualifiers)
+        record.features.append(feature)
+
+    # Read restriction sites packet
+    # We are not interested in restriction sites, but we must still read
+    # that packet so that we can skip the names and comments for each
+    # site, which are stored after that packet in a similar way as for
+    # the features above.
+    packet, length = _read_packet(handle)
+    (seq_length, num_sites) = unpack(">IH", packet[:6])
+    # Each site is stored in an 88-byte structure
+    if length - 6 != num_sites * 88:
+        raise ValueError("Sites packet size inconsistent with number of sites")
+    for i in range(0, num_sites):
+        offset = 6 + i * 88
+        site_data = packet[offset : offset + 88]
+
+        (start, end, has_name, has_comment) = unpack(">II24xII48x", site_data)
+
+        # Skip names and comments
+        if has_name:
+            _read_pstring(handle)
+        if has_comment:
+            _read_p4string(handle)
+
+    # Skip unknown packet
+    _read_packet(handle)
+
+    # Next in the file are "version packets".
+    # However they are not properly formatted "packets" as they are not
+    # preceded by an integer giving their size. Instead we have a
+    # short integer indicating how many versions there are, and then
+    # as many 260-byte blocks as there are versions.
+    num_versions = _read(handle, 2)
+    num_versions = unpack(">H", num_versions)[0]
+    versions = _read(handle, num_versions * 260)
+    for i in range(0, num_versions):
+        offset = i * 260
+        version_data = versions[offset : offset + 260]
+
+        # Each version may have a comment, which is then stored
+        # after all the "version packets".
+        has_comment = unpack(">I", version_data[-4:])[0]
+        if has_comment > 0:
+            _read_p4string(handle)
+
+    # Skip unknown fixed-size block
+    # Whatever this block contains, it is not preceded by any length
+    # indicator, so I hope its size is indeed constant in all files...
+    _read(handle, 706)
+
+    # Read the construct's name
+    name = _read_pstring(handle)
+    record.name = record.id = name.split(" ")[0]
+    record.description = name
+
+    # Circularity byte
+    # There may be other flags in that block, but their meaning
+    # is unknown to me.
+    flags = _read(handle, 17)
+    circularity = unpack(">16xB", flags)[0]
+    if circularity > 0:
+        record.annotations["topology"] = "circular"
+    else:
+        record.annotations["topology"] = "linear"
+
+    yield record
+
+
+class GckIterator(SequenceIterator):
+    """Parser for GCK files."""
+
+    def __init__(self, source):
+        """Break up a GCK file into SeqRecord objects."""
+        super().__init__(source, mode="b", fmt="GCK")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator.
+
+        Note that a GCK file can only contain one sequence, so this
+        iterator will always return a single record.
+        """
+        records = _parse(handle)
+        return records
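+
+# Illustrative usage sketch ("construct.gck" is a hypothetical path): GCK is
+# a binary format, so open it via Bio.SeqIO using the "gck" format name, e.g.
+#
+#     from Bio import SeqIO
+#     record = SeqIO.read("construct.gck", "gck")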
diff --git a/code/lib/Bio/SeqIO/IgIO.py b/code/lib/Bio/SeqIO/IgIO.py
new file mode 100644
index 0000000..6921172
--- /dev/null
+++ b/code/lib/Bio/SeqIO/IgIO.py
@@ -0,0 +1,128 @@
+# Copyright 2008-2015 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
+
+This module is for reading and writing IntelliGenetics format files as
+SeqRecord objects.  This file format appears to be the same as the MASE
+multiple sequence alignment format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+"""
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+
+
+class IgIterator(SequenceIterator):
+    """Parser for IntelliGenetics files."""
+
+    def __init__(self, source):
+        """Iterate over IntelliGenetics records (as SeqRecord objects).
+
+        source - file-like object opened in text mode, or a path to a file
+
+        The optional free format file header lines (which start with two
+        semi-colons) are ignored.
+
+        The free format commentary lines at the start of each record (which
+        start with a semi-colon) are recorded as a single string with embedded
+        new line characters in the SeqRecord's annotations dictionary under the
+        key 'comment'.
+
+        Examples
+        --------
+        >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
+        ...     for record in IgIterator(handle):
+        ...         print("%s length %i" % (record.id, len(record)))
+        ...
+        A_U455 length 303
+        B_HXB2R length 306
+        C_UG268A length 267
+        D_ELI length 309
+        F_BZ163A length 309
+        O_ANT70 length 342
+        O_MVP5180 length 348
+        CPZGAB length 309
+        CPZANT length 309
+        A_ROD length 390
+        B_EHOA length 420
+        D_MM251 length 390
+        STM_STM length 387
+        VER_AGM3 length 354
+        GRI_AGM677 length 264
+        SAB_SAB1C length 219
+        SYK_SYK length 330
+
+        """
+        super().__init__(source, mode="t", fmt="IntelliGenetics")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Iterate over the records in the IntelliGenetics file."""
+        # Skip any file header text before the first record (;; lines)
+        for line in handle:
+            if not line.startswith(";;"):
+                break
+        else:
+            # Empty file, or header only
+            return
+
+        if line[0] != ";":
+            raise ValueError("Records should start with ';' and not:\n%r" % line)
+        while line:
+            # Now iterate over the records
+
+            # Try and agree with SeqRecord convention from the GenBank parser,
+            # (and followed in the SwissProt parser) which stores the comments
+            # as a long string with newlines under annotations key 'comment'.
+
+            # Note some examples use "; ..." and others ";..."
+            comment_lines = []
+            while line.startswith(";"):
+                # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
+                comment_lines.append(line[1:].strip())
+                line = next(handle)
+            title = line.rstrip()
+
+            seq_lines = []
+            for line in handle:
+                if line[0] == ";":
+                    break
+                # Remove trailing whitespace, and any internal spaces
+                seq_lines.append(line.rstrip().replace(" ", ""))
+            else:
+                line = None
+            seq_str = "".join(seq_lines)
+            if seq_str.endswith("1"):
+                # Remove the optional terminator (digit one)
+                seq_str = seq_str[:-1]
+            if "1" in seq_str:
+                raise ValueError(
+                    "Potential terminator digit one found within sequence."
+                )
+
+            # Return the record and then continue...
+            yield SeqRecord(
+                Seq(seq_str),
+                id=title,
+                name=title,
+                annotations={"comment": "\n".join(comment_lines)},
+            )
+
+        # We should be at the end of the file now
+        assert not line
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/InsdcIO.py b/code/lib/Bio/SeqIO/InsdcIO.py
new file mode 100644
index 0000000..fd6c079
--- /dev/null
+++ b/code/lib/Bio/SeqIO/InsdcIO.py
@@ -0,0 +1,1511 @@
+# Copyright 2007-2016 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "genbank" and "embl" file formats.
+
+You are expected to use this module via the Bio.SeqIO functions.
+Note that internally this module calls Bio.GenBank to do the actual
+parsing of GenBank, EMBL and IMGT files.
+
+See Also:
+International Nucleotide Sequence Database Collaboration
+http://www.insdc.org/
+
+GenBank
+http://www.ncbi.nlm.nih.gov/Genbank/
+
+EMBL Nucleotide Sequence Database
+http://www.ebi.ac.uk/embl/
+
+DDBJ (DNA Data Bank of Japan)
+http://www.ddbj.nig.ac.jp/
+
+IMGT (uses a variant of EMBL format with longer feature indents)
+http://imgt.cines.fr/download/LIGM-DB/userman_doc.html
+http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html
+http://www.ebi.ac.uk/imgt/hla/docs/manual.html
+
+"""
+import warnings
+
+from datetime import datetime
+
+from Bio import BiopythonWarning
+from Bio import SeqFeature
+from Bio import SeqIO
+from Bio.GenBank.Scanner import _ImgtScanner
+from Bio.GenBank.Scanner import EmblScanner
+from Bio.GenBank.Scanner import GenBankScanner
+from Bio.Seq import UndefinedSequenceError
+from Bio.Seq import UnknownSeq
+
+from .Interfaces import _get_seq_string
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+# NOTE
+# ====
+# The "brains" for parsing GenBank, EMBL and IMGT files (and any
+# other flat file variants from the INSDC in future) is in
+# Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank)
+# However, all the writing code is in this file.
+
+
+class GenBankIterator(SequenceIterator):
+    """Parser for GenBank files."""
+
+    def __init__(self, source):
+        """Break up a Genbank file into SeqRecord objects.
+
+        Argument source is a file-like object opened in text mode or a path to a file.
+        Every section from the LOCUS line to the terminating // becomes
+        a single SeqRecord with associated annotation and features.
+
+        Note that for genomes or chromosomes, there is typically only
+        one record.
+
+        This gets called internally by Bio.SeqIO for the GenBank file format:
+
+        >>> from Bio import SeqIO
+        >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"):
+        ...     print(record.id)
+        ...
+        X55053.1
+        X62281.1
+        M81224.1
+        AJ237582.1
+        L31939.1
+        AF297471.1
+
+        Equivalently,
+
+        >>> with open("GenBank/cor6_6.gb") as handle:
+        ...     for record in GenBankIterator(handle):
+        ...         print(record.id)
+        ...
+        X55053.1
+        X62281.1
+        M81224.1
+        AJ237582.1
+        L31939.1
+        AF297471.1
+
+        """
+        super().__init__(source, mode="t", fmt="GenBank")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = GenBankScanner(debug=0).parse_records(handle)
+        return records
+
+
+class EmblIterator(SequenceIterator):
+    """Parser for EMBL files."""
+
+    def __init__(self, source):
+        """Break up an EMBL file into SeqRecord objects.
+
+        Argument source is a file-like object opened in text mode or a path to a file.
+        Every section from the ID line to the terminating // becomes
+        a single SeqRecord with associated annotation and features.
+
+        Note that for genomes or chromosomes, there is typically only
+        one record.
+
+        This gets called internally by Bio.SeqIO for the EMBL file format:
+
+        >>> from Bio import SeqIO
+        >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"):
+        ...     print(record.id)
+        ...
+        A00022.1
+        A00028.1
+        A00031.1
+        A00034.1
+        A00060.1
+        A00071.1
+        A00072.1
+        A00078.1
+        CQ797900.1
+
+        Equivalently,
+
+        >>> with open("EMBL/epo_prt_selection.embl") as handle:
+        ...     for record in EmblIterator(handle):
+        ...         print(record.id)
+        ...
+        A00022.1
+        A00028.1
+        A00031.1
+        A00034.1
+        A00060.1
+        A00071.1
+        A00072.1
+        A00078.1
+        CQ797900.1
+
+        """
+        super().__init__(source, mode="t", fmt="EMBL")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = EmblScanner(debug=0).parse_records(handle)
+        return records
+
+
+class ImgtIterator(SequenceIterator):
+    """Parser for IMGT files."""
+
+    def __init__(self, source):
+        """Break up an IMGT file into SeqRecord objects.
+
+        Argument source is a file-like object opened in text mode or a path to a file.
+        Every section from the ID line to the terminating // becomes
+        a single SeqRecord with associated annotation and features.
+
+        Note that for genomes or chromosomes, there is typically only
+        one record.
+        """
+        super().__init__(source, mode="t", fmt="IMGT")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = _ImgtScanner(debug=0).parse_records(handle)
+        return records
+
+
+class GenBankCdsFeatureIterator(SequenceIterator):
+    """Parser for GenBank files, creating a SeqRecord for each CDS feature."""
+
+    def __init__(self, source):
+        """Break up a Genbank file into SeqRecord objects for each CDS feature.
+
+        Argument source is a file-like object opened in text mode or a path to a file.
+
+        Every section from the LOCUS line to the terminating // can contain
+        many CDS features.  These are returned as SeqRecord objects with the
+        stated amino acid translation sequence (if given).
+        """
+        super().__init__(source, mode="t", fmt="GenBank")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        return GenBankScanner(debug=0).parse_cds_features(handle)
+
+
+class EmblCdsFeatureIterator(SequenceIterator):
+    """Parser for EMBL files, creating a SeqRecord for each CDS feature."""
+
+    def __init__(self, source):
+        """Break up a EMBL file into SeqRecord objects for each CDS feature.
+
+        Argument source is a file-like object opened in text mode or a path to a file.
+
+        Every section from the ID line to the terminating // can contain
+        many CDS features.  These are returned as SeqRecord objects with the
+        stated amino acid translation sequence (if given).
+        """
+        super().__init__(source, mode="t", fmt="EMBL")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        return EmblScanner(debug=0).parse_cds_features(handle)
+
+
+def _insdc_feature_position_string(pos, offset=0):
+    """Build a GenBank/EMBL position string (PRIVATE).
+
+    Use offset=1 to add one to convert a start position from python counting.
+    """
+    if isinstance(pos, SeqFeature.ExactPosition):
+        return "%i" % (pos.position + offset)
+    elif isinstance(pos, SeqFeature.WithinPosition):
+        return "(%i.%i)" % (
+            pos.position + offset,
+            pos.position + pos.extension + offset,
+        )
+    elif isinstance(pos, SeqFeature.BetweenPosition):
+        return "(%i^%i)" % (
+            pos.position + offset,
+            pos.position + pos.extension + offset,
+        )
+    elif isinstance(pos, SeqFeature.BeforePosition):
+        return "<%i" % (pos.position + offset)
+    elif isinstance(pos, SeqFeature.AfterPosition):
+        return ">%i" % (pos.position + offset)
+    elif isinstance(pos, SeqFeature.OneOfPosition):
+        return "one-of(%s)" % ",".join(
+            _insdc_feature_position_string(p, offset) for p in pos.position_choices
+        )
+    elif isinstance(pos, SeqFeature.AbstractPosition):
+        raise NotImplementedError("Please report this as a bug in Biopython.")
+    else:
+        raise ValueError("Expected a SeqFeature position object.")
+
+
+def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
+    if location.ref:
+        ref = "%s:" % location.ref
+    else:
+        ref = ""
+    assert not location.ref_db
+    if (
+        isinstance(location.start, SeqFeature.ExactPosition)
+        and isinstance(location.end, SeqFeature.ExactPosition)
+        and location.start.position == location.end.position
+    ):
+        # Special case, for 12:12 return 12^13
+        # (a zero length slice, meaning the point between two letters)
+        if location.end.position == rec_length:
+            # Very special case, for a between position at the end of a
+            # sequence (used on some circular genomes, Bug 3098) we have
+            # N:N so return N^1
+            return "%s%i^1" % (ref, rec_length)
+        else:
+            return "%s%i^%i" % (ref, location.end.position, location.end.position + 1)
+    if (
+        isinstance(location.start, SeqFeature.ExactPosition)
+        and isinstance(location.end, SeqFeature.ExactPosition)
+        and location.start.position + 1 == location.end.position
+    ):
+        # Special case, for 11:12 return 12 rather than 12..12
+        # (a length one slice, meaning a single letter)
+        return "%s%i" % (ref, location.end.position)
+    elif isinstance(location.start, SeqFeature.UnknownPosition) or isinstance(
+        location.end, SeqFeature.UnknownPosition
+    ):
+        # Special case for features from SwissProt/UniProt files
+        if isinstance(location.start, SeqFeature.UnknownPosition) and isinstance(
+            location.end, SeqFeature.UnknownPosition
+        ):
+            # warnings.warn("Feature with unknown location", BiopythonWarning)
+            # return "?"
+            raise ValueError("Feature with unknown location")
+        elif isinstance(location.start, SeqFeature.UnknownPosition):
+            # Treat the unknown start position as a BeforePosition
+            return "%s<%i..%s" % (
+                ref,
+                location.nofuzzy_end,
+                _insdc_feature_position_string(location.end),
+            )
+        else:
+            # Treat the unknown end position as an AfterPosition
+            return "%s%s..>%i" % (
+                ref,
+                _insdc_feature_position_string(location.start, +1),
+                location.nofuzzy_start + 1,
+            )
+    else:
+        # Typical case, e.g. 12..15 gets mapped to 11:15
+        return (
+            ref
+            + _insdc_feature_position_string(location.start, +1)
+            + ".."
+            + _insdc_feature_position_string(location.end)
+        )
+
+
+def _insdc_location_string(location, rec_length):
+    """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE).
+
+    There is a choice of how to show joins on the reverse complement strand,
+    GenBank used "complement(join(1..10,20..100))" while EMBL used to use
+    "join(complement(20..100),complement(1..10))" instead (but appears to have
+    now adopted the GenBank convention). Notice that the order of the entries
+    is reversed! This function therefore uses the first form. In this situation
+    we expect the CompoundFeatureLocation and its parts to all be marked as
+    strand == -1, and to be in the order 19:100 then 0:10.
+    """
+    try:
+        parts = location.parts
+        # CompoundFeatureLocation
+        if location.strand == -1:
+            # Special case, put complement outside the join/order/... and reverse order
+            return "complement(%s(%s))" % (
+                location.operator,
+                ",".join(
+                    _insdc_location_string_ignoring_strand_and_subfeatures(
+                        p, rec_length
+                    )
+                    for p in parts[::-1]
+                ),
+            )
+        else:
+            return "%s(%s)" % (
+                location.operator,
+                ",".join(_insdc_location_string(p, rec_length) for p in parts),
+            )
+    except AttributeError:
+        # Simple FeatureLocation
+        loc = _insdc_location_string_ignoring_strand_and_subfeatures(
+            location, rec_length
+        )
+        if location.strand == -1:
+            return "complement(%s)" % loc
+        else:
+            return loc
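+
+# Illustrative sketch: a simple reverse-strand location covering Python
+# slice 0:10 is rendered with one-based inclusive INSDC coordinates and
+# wrapped in complement():
+#
+#     from Bio.SeqFeature import FeatureLocation
+#     loc = FeatureLocation(0, 10, strand=-1)
+#     _insdc_location_string(loc, 100)  # "complement(1..10)"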
+
+
+class _InsdcWriter(SequenceWriter):
+    """Base class for GenBank and EMBL writers (PRIVATE)."""
+
+    MAX_WIDTH = 80
+    QUALIFIER_INDENT = 21
+    QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT
+    QUALIFIER_INDENT_TMP = "     %s                "  # 21 if %s is empty
+    FTQUAL_NO_QUOTE = (
+        "anticodon",
+        "citation",
+        "codon_start",
+        "compare",
+        "direction",
+        "estimated_length",
+        "mod_base",
+        "number",
+        "rpt_type",
+        "rpt_unit_range",
+        "tag_peptide",
+        "transl_except",
+        "transl_table",
+    )
+
+    def _write_feature_qualifier(self, key, value=None, quote=None):
+        if value is None:
+            # Value-less entry like /pseudo
+            self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key))
+            return
+
+        if type(value) == str:
+            value = value.replace(
+                '"', '""'
+            )  # NCBI says escape " as "" in qualifier values
+
+        # Quick hack with no line wrapping, may be useful for testing:
+        # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value))
+        if quote is None:
+            # Try to mimic unwritten rules about when quotes can be left out:
+            if isinstance(value, int) or key in self.FTQUAL_NO_QUOTE:
+                quote = False
+            else:
+                quote = True
+        if quote:
+            line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value)
+        else:
+            line = "%s/%s=%s" % (self.QUALIFIER_INDENT_STR, key, value)
+        if len(line) <= self.MAX_WIDTH:
+            self.handle.write(line + "\n")
+            return
+        while line.lstrip():
+            if len(line) <= self.MAX_WIDTH:
+                self.handle.write(line + "\n")
+                return
+            # Insert line break...
+            for index in range(
+                min(len(line) - 1, self.MAX_WIDTH), self.QUALIFIER_INDENT + 1, -1
+            ):
+                if line[index] == " ":
+                    break
+            if line[index] != " ":
+                # No nice place to break...
+                index = self.MAX_WIDTH
+            assert index <= self.MAX_WIDTH
+            self.handle.write(line[:index] + "\n")
+            line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
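+
+    # Illustrative note: keys in FTQUAL_NO_QUOTE and integer values are
+    # written bare (e.g. /codon_start=1) while everything else is quoted
+    # (e.g. /gene="abc"); both are indented by QUALIFIER_INDENT_STR and
+    # wrapped to MAX_WIDTH, preferring to break at spaces.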
+
+    def _wrap_location(self, location):
+        """Split a feature location into lines (break at commas) (PRIVATE)."""
+        # TODO - Rewrite this not to recurse!
+        length = self.MAX_WIDTH - self.QUALIFIER_INDENT
+        if len(location) <= length:
+            return location
+        index = location[:length].rfind(",")
+        if index == -1:
+            # No good place to split (!)
+            warnings.warn("Couldn't split location:\n%s" % location, BiopythonWarning)
+            return location
+        return (
+            location[: index + 1]
+            + "\n"
+            + self.QUALIFIER_INDENT_STR
+            + self._wrap_location(location[index + 1 :])
+        )
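+
+    # Illustrative note: a join() location longer than 59 characters
+    # (MAX_WIDTH minus QUALIFIER_INDENT) is broken after a comma, with each
+    # continuation line indented by QUALIFIER_INDENT_STR, recursing until
+    # every piece fits.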
+
+    def _write_feature(self, feature, record_length):
+        """Write a single SeqFeature object to features table (PRIVATE)."""
+        assert feature.type, feature
+        location = _insdc_location_string(feature.location, record_length)
+        f_type = feature.type.replace(" ", "_")
+        line = (
+            (self.QUALIFIER_INDENT_TMP % f_type)[: self.QUALIFIER_INDENT]
+            + self._wrap_location(location)
+            + "\n"
+        )
+        self.handle.write(line)
+        # Now the qualifiers...
+        # Note as of Biopython 1.69, this is an ordered-dict, don't sort it:
+        for key, values in feature.qualifiers.items():
+            if isinstance(values, (list, tuple)):
+                for value in values:
+                    self._write_feature_qualifier(key, value)
+            else:
+                # String, int, etc - or None for a /pseudo type entry
+                self._write_feature_qualifier(key, values)
+
+    @staticmethod
+    def _get_annotation_str(record, key, default=".", just_first=False):
+        """Get an annotation dictionary entry (as a string) (PRIVATE).
+
+        Some entries are lists, in which case if just_first=True the first entry
+        is returned.  If just_first=False (default) this verifies there is only
+        one entry before returning it.
+        """
+        try:
+            answer = record.annotations[key]
+        except KeyError:
+            return default
+        if isinstance(answer, list):
+            if not just_first:
+                assert len(answer) == 1
+            return str(answer[0])
+        else:
+            return str(answer)
+
+    @staticmethod
+    def _split_multi_line(text, max_len):
+        """Return a list of strings (PRIVATE).
+
+        Any single word which is too long gets returned as a whole line
+        (e.g. URLs) without an exception or warning.
+        """
+        # TODO - Do the line splitting while preserving white space?
+        text = text.strip()
+        if len(text) <= max_len:
+            return [text]
+
+        words = text.split()
+        text = ""
+        while words and len(text) + 1 + len(words[0]) <= max_len:
+            text += " " + words.pop(0)
+            text = text.strip()
+        # assert len(text) <= max_len
+        answer = [text]
+        while words:
+            text = words.pop(0)
+            while words and len(text) + 1 + len(words[0]) <= max_len:
+                text += " " + words.pop(0)
+                text = text.strip()
+            # assert len(text) <= max_len
+            answer.append(text)
+        assert not words
+        return answer
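+
+    # Illustrative sketch: words are packed greedily onto lines of at most
+    # max_len characters, e.g.
+    #
+    #     _InsdcWriter._split_multi_line("alpha beta gamma", 10)
+    #     # -> ["alpha beta", "gamma"]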
+
+    def _split_contig(self, record, max_len):
+        """Return a list of strings, splits on commas (PRIVATE)."""
+        # TODO - Merge this with _write_multi_line method?
+        # It would need the addition of the comma splitting logic...
+        # are there any other cases where that would be sensible?
+        contig = record.annotations.get("contig", "")
+        if isinstance(contig, (list, tuple)):
+            contig = "".join(contig)
+        contig = self.clean(contig)
+        answer = []
+        while contig:
+            if len(contig) > max_len:
+                # Split lines at the commas
+                pos = contig[: max_len - 1].rfind(",")
+                if pos == -1:
+                    raise ValueError("Could not break up CONTIG")
+                text, contig = contig[: pos + 1], contig[pos + 1 :]
+            else:
+                text, contig = contig, ""
+            answer.append(text)
+        return answer
+
+
+class GenBankWriter(_InsdcWriter):
+    """GenBank writer."""
+
+    HEADER_WIDTH = 12
+    QUALIFIER_INDENT = 21
+    STRUCTURED_COMMENT_START = "-START##"
+    STRUCTURED_COMMENT_END = "-END##"
+    STRUCTURED_COMMENT_DELIM = " :: "
+    LETTERS_PER_LINE = 60
+    SEQUENCE_INDENT = 9
+
+    def _write_single_line(self, tag, text):
+        """Write single line in each GenBank record (PRIVATE).
+
+        Used in the 'header' of each GenBank record.
+        """
+        assert len(tag) < self.HEADER_WIDTH
+        if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH:
+            if tag:
+                warnings.warn(
+                    "Annotation %r too long for %r line" % (text, tag), BiopythonWarning
+                )
+            else:
+                # Can't give such a precise warning
+                warnings.warn("Annotation %r too long" % text, BiopythonWarning)
+        self.handle.write(
+            "%s%s\n" % (tag.ljust(self.HEADER_WIDTH), text.replace("\n", " "))
+        )
+
+    def _write_multi_line(self, tag, text):
+        """Write multiple lines in each GenBank record (PRIVATE).
+
+        Used in the 'header' of each GenBank record.
+        """
+        # TODO - Do the line splitting while preserving white space?
+        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+        lines = self._split_multi_line(text, max_len)
+        self._write_single_line(tag, lines[0])
+        for line in lines[1:]:
+            self._write_single_line("", line)
+
+    def _write_multi_entries(self, tag, text_list):
+        # used for DBLINK and any similar later line types.
+        # If the list of strings is empty, nothing is written.
+        for i, text in enumerate(text_list):
+            if i == 0:
+                self._write_single_line(tag, text)
+            else:
+                self._write_single_line("", text)
+
+    @staticmethod
+    def _get_date(record):
+        default = "01-JAN-1980"
+        try:
+            date = record.annotations["date"]
+        except KeyError:
+            return default
+        # Cope with a list of one string:
+        if isinstance(date, list) and len(date) == 1:
+            date = date[0]
+        if isinstance(date, datetime):
+            date = date.strftime("%d-%b-%Y").upper()
+
+        months = [
+            "JAN",
+            "FEB",
+            "MAR",
+            "APR",
+            "MAY",
+            "JUN",
+            "JUL",
+            "AUG",
+            "SEP",
+            "OCT",
+            "NOV",
+            "DEC",
+        ]
+        if not isinstance(date, str) or len(date) != 11:
+            return default
+        try:
+            datetime(int(date[-4:]), months.index(date[3:6]) + 1, int(date[0:2]))
+        except ValueError:
+            date = default
+        return date
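+
+    # Illustrative sketch ("rec" is a hypothetical SeqRecord): a datetime
+    # annotation is rendered in GenBank's DD-MMM-YYYY form (assuming an
+    # English month abbreviation from strftime), and anything unparseable
+    # falls back to the "01-JAN-1980" default:
+    #
+    #     from datetime import datetime
+    #     rec.annotations["date"] = datetime(2021, 3, 5)
+    #     GenBankWriter._get_date(rec)  # -> "05-MAR-2021"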
+
+    @staticmethod
+    def _get_data_division(record):
+        try:
+            division = record.annotations["data_file_division"]
+        except KeyError:
+            division = "UNK"
+        if division in [
+            "PRI",
+            "ROD",
+            "MAM",
+            "VRT",
+            "INV",
+            "PLN",
+            "BCT",
+            "VRL",
+            "PHG",
+            "SYN",
+            "UNA",
+            "EST",
+            "PAT",
+            "STS",
+            "GSS",
+            "HTG",
+            "HTC",
+            "ENV",
+            "CON",
+            "TSA",
+        ]:
+            # Good, already GenBank style
+            #    PRI - primate sequences
+            #    ROD - rodent sequences
+            #    MAM - other mammalian sequences
+            #    VRT - other vertebrate sequences
+            #    INV - invertebrate sequences
+            #    PLN - plant, fungal, and algal sequences
+            #    BCT - bacterial sequences [plus archaea]
+            #    VRL - viral sequences
+            #    PHG - bacteriophage sequences
+            #    SYN - synthetic sequences
+            #    UNA - unannotated sequences
+            #    EST - EST sequences (expressed sequence tags)
+            #    PAT - patent sequences
+            #    STS - STS sequences (sequence tagged sites)
+            #    GSS - GSS sequences (genome survey sequences)
+            #    HTG - HTGS sequences (high throughput genomic sequences)
+            #    HTC - HTC sequences (high throughput cDNA sequences)
+            #    ENV - Environmental sampling sequences
+            #    CON - Constructed sequences
+            #    TSA - Transcriptome Shotgun Assembly
+            #
+            # (plus UNK for unknown)
+            pass
+        else:
+            # See if this is in EMBL style:
+            #    Division                 Code
+            #    -----------------        ----
+            #    Bacteriophage            PHG - common
+            #    Environmental Sample     ENV - common
+            #    Fungal                   FUN - map to PLN (plants + fungal)
+            #    Human                    HUM - map to PRI (primates)
+            #    Invertebrate             INV - common
+            #    Other Mammal             MAM - common
+            #    Other Vertebrate         VRT - common
+            #    Mus musculus             MUS - map to ROD (rodent)
+            #    Plant                    PLN - common
+            #    Prokaryote               PRO - map to BCT (poor name)
+            #    Other Rodent             ROD - common
+            #    Synthetic                SYN - common
+            #    Transgenic               TGN - ??? map to SYN ???
+            #    Unclassified             UNC - map to UNK
+            #    Viral                    VRL - common
+            #
+            # (plus XXX for submitting which we can map to UNK)
+            embl_to_gbk = {
+                "FUN": "PLN",
+                "HUM": "PRI",
+                "MUS": "ROD",
+                "PRO": "BCT",
+                "UNC": "UNK",
+                "XXX": "UNK",
+            }
+            try:
+                division = embl_to_gbk[division]
+            except KeyError:
+                division = "UNK"
+        assert len(division) == 3
+        return division
+
+    def _get_topology(self, record):
+        """Set the topology to 'circular', 'linear' if defined (PRIVATE)."""
+        max_topology_len = len("circular")
+
+        topology = self._get_annotation_str(record, "topology", default="")
+        if topology and len(topology) <= max_topology_len:
+            return topology.ljust(max_topology_len)
+        else:
+            return " " * max_topology_len
+
+    def _write_the_first_line(self, record):
+        """Write the LOCUS line (PRIVATE)."""
+        locus = record.name
+        if not locus or locus == "":
+            locus = record.id
+        if not locus or locus == "":
+            locus = self._get_annotation_str(record, "accession", just_first=True)
+        if len(locus) > 16:
+            if len(locus) + 1 + len(str(len(record))) > 28:
+                # Locus name and record length too long to squeeze in.
+                # Per updated GenBank standard (Dec 15, 2018) 229.0
+                # the Locus identifier can be any length, and a space
+                # is added after the identifier to keep the identifier
+                # and length fields separated
+                warnings.warn(
+                    "Increasing length of locus line to allow "
+                    "long name. This will result in fields that "
+                    "are not in usual positions.",
+                    BiopythonWarning,
+                )
+
+        if len(locus.split()) > 1:
+            raise ValueError("Invalid whitespace in %r for LOCUS line" % locus)
+        if len(record) > 99999999999:
+            # As of the GenBank release notes 229.0, the locus line can be
+            # any length. However, long locus lines may not be compatible
+            # with all software.
+            warnings.warn(
+                "The sequence length is very long. The LOCUS "
+                "line will be increased in length to compensate. "
+                "This may cause unexpected behavior.",
+                BiopythonWarning,
+            )
+
+        # Get the molecule type
+        mol_type = self._get_annotation_str(record, "molecule_type", None)
+        if mol_type is None:
+            raise ValueError("missing molecule_type in annotations")
+        if mol_type and len(mol_type) > 7:
+            # Deal with common cases from EMBL to GenBank
+            mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "")
+            if len(mol_type) > 7:
+                warnings.warn("Molecule type %r too long" % mol_type, BiopythonWarning)
+                mol_type = "DNA"
+        if mol_type in ["protein", "PROTEIN"]:
+            mol_type = ""
+
+        if mol_type == "":
+            units = "aa"
+        else:
+            units = "bp"
+
+        topology = self._get_topology(record)
+
+        division = self._get_data_division(record)
+
+        # Accommodate longer header, with long accessions and lengths
+        if len(locus) > 16 and len(str(len(record))) > (11 - (len(locus) - 16)):
+            name_length = locus + " " + str(len(record))
+
+        # This is the older, standard 80 position header
+        else:
+            name_length = str(len(record)).rjust(28)
+            name_length = locus + name_length[len(locus) :]
+            assert len(name_length) == 28, name_length
+            assert " " in name_length, name_length
+
+        assert len(units) == 2
+        assert len(division) == 3
+        line = "LOCUS       %s %s    %s %s %s %s\n" % (
+            name_length,
+            units,
+            mol_type.ljust(7),
+            topology,
+            division,
+            self._get_date(record),
+        )
+        # Extra long header
+        if len(line) > 80:
+            splitline = line.split()
+            if splitline[3] not in ["bp", "aa"]:
+                raise ValueError(
+                    "LOCUS line does not contain size units at "
+                    "expected position:\n" + line
+                )
+
+            if not (
+                splitline[4].strip() == ""
+                or "DNA" in splitline[4].strip().upper()
+                or "RNA" in splitline[4].strip().upper()
+            ):
+                raise ValueError(
+                    "LOCUS line does not contain valid "
+                    "sequence type (DNA, RNA, ...):\n" + line
+                )
+
+            self.handle.write(line)
+
+        # 80 position header
+        else:
+            assert len(line) == 79 + 1, repr(line)  # plus one for new line
+
+            # We're bending the rules to allow an identifier over 16 characters
+            # if we can steal spaces from the length field:
+            # assert line[12:28].rstrip() == locus, \
+            #     'LOCUS line does not contain the locus at the expected position:\n' + line
+            # assert line[28:29] == " "
+            # assert line[29:40].lstrip() == str(len(record)), \
+            #     'LOCUS line does not contain the length at the expected position:\n' + line
+            assert line[12:40].split() == [locus, str(len(record))], line
+
+            # Tests copied from Bio.GenBank.Scanner
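+            # For reference, a conforming 80-column LOCUS line (this example
+            # also appears in the DBLINK notes in write_record; the slices
+            # below are 0-based):
+            # LOCUS       NC_000011               1606 bp    DNA     linear   CON 06-JUN-2016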
+            if line[40:44] not in [" bp ", " aa "]:
+                raise ValueError(
+                    "LOCUS line does not contain size units at "
+                    "expected position:\n" + line
+                )
+            if line[44:47] not in ["   ", "ss-", "ds-", "ms-"]:
+                raise ValueError(
+                    "LOCUS line does not have valid strand "
+                    "type (Single stranded, ...):\n" + line
+                )
+            if not (
+                line[47:54].strip() == ""
+                or "DNA" in line[47:54].strip().upper()
+                or "RNA" in line[47:54].strip().upper()
+            ):
+                raise ValueError(
+                    "LOCUS line does not contain valid "
+                    "sequence type (DNA, RNA, ...):\n" + line
+                )
+            if line[54:55] != " ":
+                raise ValueError(
+                    "LOCUS line does not contain space at position 55:\n" + line
+                )
+            if line[55:63].strip() not in ["", "linear", "circular"]:
+                raise ValueError(
+                    "LOCUS line does not contain valid "
+                    "entry (linear, circular, ...):\n" + line
+                )
+            if line[63:64] != " ":
+                raise ValueError(
+                    "LOCUS line does not contain space at position 64:\n" + line
+                )
+            if line[67:68] != " ":
+                raise ValueError(
+                    "LOCUS line does not contain space at position 68:\n" + line
+                )
+            if line[70:71] != "-":
+                raise ValueError(
+                    "LOCUS line does not contain - at position 71 in date:\n" + line
+                )
+            if line[74:75] != "-":
+                raise ValueError(
+                    "LOCUS line does not contain - at position 75 in date:\n" + line
+                )
+
+            self.handle.write(line)
+
+    def _write_references(self, record):
+        number = 0
+        for ref in record.annotations["references"]:
+            if not isinstance(ref, SeqFeature.Reference):
+                continue
+            number += 1
+            data = str(number)
+            # TODO - support more complex record reference locations?
+            if ref.location and len(ref.location) == 1:
+                molecule_type = record.annotations.get("molecule_type")
+                if molecule_type and "protein" in molecule_type:
+                    units = "residues"
+                else:
+                    units = "bases"
+                data += "  (%s %i to %i)" % (
+                    units,
+                    ref.location[0].nofuzzy_start + 1,
+                    ref.location[0].nofuzzy_end,
+                )
+            self._write_single_line("REFERENCE", data)
+            if ref.authors:
+                # We store the AUTHORS data as a single string
+                self._write_multi_line("  AUTHORS", ref.authors)
+            if ref.consrtm:
+                # We store the consortium as a single string
+                self._write_multi_line("  CONSRTM", ref.consrtm)
+            if ref.title:
+                # We store the title as a single string
+                self._write_multi_line("  TITLE", ref.title)
+            if ref.journal:
+                # We store this as a single string - holds the journal name,
+                # volume, year, and page numbers of the citation
+                self._write_multi_line("  JOURNAL", ref.journal)
+            if ref.medline_id:
+                # This line type is obsolete and was removed from the GenBank
+                # flatfile format in April 2005. Should we write it?
+                # Note this has a two space indent:
+                self._write_multi_line("  MEDLINE", ref.medline_id)
+            if ref.pubmed_id:
+                # Note this has a THREE space indent:
+                self._write_multi_line("   PUBMED", ref.pubmed_id)
+            if ref.comment:
+                self._write_multi_line("  REMARK", ref.comment)
+
+    def _write_comment(self, record):
+        # This is a bit complicated due to the range of possible
+        # ways people might have done their annotation...
+        # Currently the parser uses a single string with newlines.
+        # A list of lines is also reasonable.
+        # A single (long) string is perhaps the most natural of all.
+        # This means we may need to deal with line wrapping.
+        lines = []
+        if "structured_comment" in record.annotations:
+            comment = record.annotations["structured_comment"]
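+            # Structured comments are written as delimited key/value blocks,
+            # e.g. (illustrative; the exact delimiters come from the class
+            # constants):
+            #     ##Genome-Assembly-Data-START##
+            #     Assembly Method :: SOAPdenovo v. 1.05
+            #     ##Genome-Assembly-Data-END##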
+            # Find max length of keys for equal padded printing
+            padding = 0
+            for key, data in comment.items():
+                for subkey, subdata in data.items():
+                    padding = len(subkey) if len(subkey) > padding else padding
+            # Construct output
+            for key, data in comment.items():
+                lines.append(f"##{key}{self.STRUCTURED_COMMENT_START}")
+                for subkey, subdata in data.items():
+                    spaces = " " * (padding - len(subkey))
+                    lines.append(
+                        f"{subkey}{spaces}{self.STRUCTURED_COMMENT_DELIM}{subdata}"
+                    )
+                lines.append(f"##{key}{self.STRUCTURED_COMMENT_END}")
+        if "comment" in record.annotations:
+            comment = record.annotations["comment"]
+            if isinstance(comment, str):
+                lines += comment.split("\n")
+            elif isinstance(comment, (list, tuple)):
+                lines += list(comment)
+            else:
+                raise ValueError("Could not understand comment annotation")
+        self._write_multi_line("COMMENT", lines[0])
+        for line in lines[1:]:
+            self._write_multi_line("", line)
+
+    def _write_contig(self, record):
+        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+        lines = self._split_contig(record, max_len)
+        self._write_single_line("CONTIG", lines[0])
+        for text in lines[1:]:
+            self._write_single_line("", text)
+
+    def _write_sequence(self, record):
+        # Loosely based on code from Howard Salis
+        # TODO - Force lower case?
+
+        if isinstance(record.seq, UnknownSeq):
+            data = None
+        else:
+            try:
+                data = _get_seq_string(record)
+            except UndefinedSequenceError:
+                data = None
+
+        if data is None:
+            # We have already recorded the length, and there is no need
+            # to record a long sequence of NNNNNNN...NNN or whatever.
+            if "contig" in record.annotations:
+                self._write_contig(record)
+            else:
+                self.handle.write("ORIGIN\n")
+            return
+
+        # Catches sequence being None:
+        data = data.lower()
+        seq_len = len(data)
+        self.handle.write("ORIGIN\n")
+        for line_number in range(0, seq_len, self.LETTERS_PER_LINE):
+            self.handle.write(str(line_number + 1).rjust(self.SEQUENCE_INDENT))
+            for words in range(
+                line_number, min(line_number + self.LETTERS_PER_LINE, seq_len), 10
+            ):
+                self.handle.write(" %s" % data[words : words + 10])
+            self.handle.write("\n")
+
+    def write_record(self, record):
+        """Write a single record to the output file."""
+        handle = self.handle
+        self._write_the_first_line(record)
+
+        default = record.id
+        if default.count(".") == 1 and default[default.index(".") + 1 :].isdigit():
+            # Good, looks like accession.version and not something
+            # else like identifier.start-end
+            default = record.id.split(".", 1)[0]
+        accession = self._get_annotation_str(
+            record, "accession", default, just_first=True
+        )
+        acc_with_version = accession
+        if record.id.startswith(accession + "."):
+            try:
+                acc_with_version = "%s.%i" % (
+                    accession,
+                    int(record.id.split(".", 1)[1]),
+                )
+            except ValueError:
+                pass
+        gi = self._get_annotation_str(record, "gi", just_first=True)
+
+        descr = record.description
+        if descr == "":
+            descr = ""  # Trailing dot will be added later
+
+        # The DEFINITION field must end with a period
+        # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5]
+        # and discussion https://github.com/biopython/biopython/pull/616
+        # So let's add a period
+        descr += "."
+        self._write_multi_line("DEFINITION", descr)
+
+        self._write_single_line("ACCESSION", accession)
+        if gi != ".":
+            self._write_single_line("VERSION", "%s  GI:%s" % (acc_with_version, gi))
+        else:
+            self._write_single_line("VERSION", "%s" % acc_with_version)
+
+        # The NCBI initially expected two types of link,
+        # e.g. "Project:28471" and "Trace Assembly Archive:123456"
+        #
+        # This changed and at some point the formatting switched to
+        # include a space after the colon, e.g.
+        #
+        # LOCUS       NC_000011               1606 bp    DNA     linear   CON 06-JUN-2016
+        # DEFINITION  Homo sapiens chromosome 11, GRCh38.p7 Primary Assembly.
+        # ACCESSION   NC_000011 REGION: complement(5225466..5227071) GPC_000001303
+        # VERSION     NC_000011.10  GI:568815587
+        # DBLINK      BioProject: PRJNA168
+        #             Assembly: GCF_000001405.33
+        # ...
+        #
+        # Or,
+        #
+        # LOCUS       JU120277                1044 bp    mRNA    linear   TSA 27-NOV-2012
+        # DEFINITION  TSA: Tupaia chinensis tbc000002.Tuchadli mRNA sequence.
+        # ACCESSION   JU120277
+        # VERSION     JU120277.1  GI:379775257
+        # DBLINK      BioProject: PRJNA87013
+        #             Sequence Read Archive: SRR433859
+        # ...
+        dbxrefs_with_space = []
+        for x in record.dbxrefs:
+            if ": " not in x:
+                x = x.replace(":", ": ")
+            dbxrefs_with_space.append(x)
+        self._write_multi_entries("DBLINK", dbxrefs_with_space)
+        del dbxrefs_with_space
+
+        try:
+            # List of strings
+            # Keywords should be given separated with semi colons,
+            keywords = "; ".join(record.annotations["keywords"])
+            # with a trailing period:
+            if not keywords.endswith("."):
+                keywords += "."
+        except KeyError:
+            # If no keywords, there should be just a period:
+            keywords = "."
+        self._write_multi_line("KEYWORDS", keywords)
+
+        if "segment" in record.annotations:
+            # Deal with SEGMENT line found only in segmented records,
+            # e.g. AH000819
+            segment = record.annotations["segment"]
+            if isinstance(segment, list):
+                assert len(segment) == 1, segment
+                segment = segment[0]
+            self._write_single_line("SEGMENT", segment)
+
+        self._write_multi_line("SOURCE", self._get_annotation_str(record, "source"))
+        # The ORGANISM line MUST be a single line, as any continuation is the taxonomy
+        org = self._get_annotation_str(record, "organism")
+        if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH:
+            org = org[: self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..."
+        self._write_single_line("  ORGANISM", org)
+        try:
+            # List of strings
+            # Taxonomy should be given separated with semi colons,
+            taxonomy = "; ".join(record.annotations["taxonomy"])
+            # with a trailing period:
+            if not taxonomy.endswith("."):
+                taxonomy += "."
+        except KeyError:
+            taxonomy = "."
+        self._write_multi_line("", taxonomy)
+
+        if "db_source" in record.annotations:
+            # Hack around the issue of BioSQL loading a list for the db_source
+            db_source = record.annotations["db_source"]
+            if isinstance(db_source, list):
+                db_source = db_source[0]
+            self._write_single_line("DBSOURCE", db_source)
+
+        if "references" in record.annotations:
+            self._write_references(record)
+
+        if (
+            "comment" in record.annotations
+            or "structured_comment" in record.annotations
+        ):
+            self._write_comment(record)
+
+        handle.write("FEATURES             Location/Qualifiers\n")
+        rec_length = len(record)
+        for feature in record.features:
+            self._write_feature(feature, rec_length)
+        self._write_sequence(record)
+        handle.write("//\n")
+
+
+class EmblWriter(_InsdcWriter):
+    """EMBL writer."""
+
+    HEADER_WIDTH = 5
+    QUALIFIER_INDENT = 21
+    QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2)
+    QUALIFIER_INDENT_TMP = "FT   %s                "  # 21 if %s is empty
+    # Note second spacer line of just FH is expected:
+    FEATURE_HEADER = "FH   Key             Location/Qualifiers\nFH\n"
+
+    LETTERS_PER_BLOCK = 10
+    BLOCKS_PER_LINE = 6
+    LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
+    POSITION_PADDING = 10
+
+    def _write_contig(self, record):
+        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+        lines = self._split_contig(record, max_len)
+        for text in lines:
+            self._write_single_line("CO", text)
+
+    def _write_sequence(self, record):
+        handle = self.handle  # save looking up this multiple times
+
+        if isinstance(record.seq, UnknownSeq):
+            data = None
+        else:
+            try:
+                data = _get_seq_string(record)
+            except UndefinedSequenceError:
+                data = None
+
+        if data is None:
+            # We have already recorded the length, and there is no need
+            # to record a long sequence of NNNNNNN...NNN or whatever.
+            if "contig" in record.annotations:
+                self._write_contig(record)
+            else:
+                # TODO - Can the sequence just be left out as in GenBank files?
+                handle.write("SQ   \n")
+            return
+
+        # Catches sequence being None
+        data = data.lower()
+        seq_len = len(data)
+
+        molecule_type = record.annotations.get("molecule_type")
+        if molecule_type is not None and "DNA" in molecule_type:
+            # TODO - What if we have RNA?
+            a_count = data.count("A") + data.count("a")
+            c_count = data.count("C") + data.count("c")
+            g_count = data.count("G") + data.count("g")
+            t_count = data.count("T") + data.count("t")
+            other = seq_len - (a_count + c_count + g_count + t_count)
+            handle.write(
+                "SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n"
+                % (seq_len, a_count, c_count, g_count, t_count, other)
+            )
+        else:
+            handle.write("SQ   \n")
+
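+        # Each data line: four leading spaces, six blocks of " " + ten
+        # letters, then the cumulative position right-justified to
+        # POSITION_PADDING columns (4 + 6 * 11 + 10 = 80 columns), e.g.
+        # (illustrative):
+        #     gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac aatatccata        60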
+        for line_number in range(0, seq_len // self.LETTERS_PER_LINE):
+            handle.write("    ")  # Just four, not five
+            for block in range(self.BLOCKS_PER_LINE):
+                index = (
+                    self.LETTERS_PER_LINE * line_number + self.LETTERS_PER_BLOCK * block
+                )
+                handle.write(" %s" % data[index : index + self.LETTERS_PER_BLOCK])
+            handle.write(
+                str((line_number + 1) * self.LETTERS_PER_LINE).rjust(
+                    self.POSITION_PADDING
+                )
+            )
+            handle.write("\n")
+        if seq_len % self.LETTERS_PER_LINE:
+            # Final (partial) line
+            line_number = seq_len // self.LETTERS_PER_LINE
+            handle.write("    ")  # Just four, not five
+            for block in range(self.BLOCKS_PER_LINE):
+                index = (
+                    self.LETTERS_PER_LINE * line_number + self.LETTERS_PER_BLOCK * block
+                )
+                handle.write(
+                    (" %s" % data[index : index + self.LETTERS_PER_BLOCK]).ljust(11)
+                )
+            handle.write(str(seq_len).rjust(self.POSITION_PADDING))
+            handle.write("\n")
+
+    def _write_single_line(self, tag, text):
+        assert len(tag) == 2
+        line = tag + "   " + text
+        if len(text) > self.MAX_WIDTH:
+            warnings.warn("Line %r too long" % line, BiopythonWarning)
+        self.handle.write(line + "\n")
+
+    def _write_multi_line(self, tag, text):
+        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+        lines = self._split_multi_line(text, max_len)
+        for line in lines:
+            self._write_single_line(tag, line)
+
+    def _write_the_first_lines(self, record):
+        """Write the ID and AC lines (PRIVATE)."""
+        if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
+            version = "SV " + record.id.rsplit(".", 1)[1]
+            accession = self._get_annotation_str(
+                record, "accession", record.id.rsplit(".", 1)[0], just_first=True
+            )
+        else:
+            version = ""
+            accession = self._get_annotation_str(
+                record, "accession", record.id, just_first=True
+            )
+
+        if ";" in accession:
+            raise ValueError(
+                "Cannot have semi-colon in EMBL accession, '%s'" % accession
+            )
+        if " " in accession:
+            # This is out of practicality... might it be allowed?
+            raise ValueError("Cannot have spaces in EMBL accession, '%s'" % accession)
+
+        topology = self._get_annotation_str(record, "topology", default="")
+
+        # Get the molecule type
+        # TODO - record this explicitly in the parser?
+        # Note often get RNA vs DNA discrepancy in real EMBL/NCBI files
+        mol_type = record.annotations.get("molecule_type")
+        if mol_type is None:
+            raise ValueError("missing molecule_type in annotations")
+        if mol_type not in ("DNA", "RNA", "protein"):
+            warnings.warn("Non-standard molecule type: %s" % mol_type, BiopythonWarning)
+        mol_type_upper = mol_type.upper()
+        if "DNA" in mol_type_upper:
+            units = "BP"
+        elif "RNA" in mol_type_upper:
+            units = "BP"
+        elif "PROTEIN" in mol_type_upper:
+            mol_type = "PROTEIN"
+            units = "AA"
+        else:
+            raise ValueError("failed to understand molecule_type '%s'" % mol_type)
+
+        # Get the taxonomy division
+        division = self._get_data_division(record)
+
+        # TODO - Full ID line
+        handle = self.handle
+        # ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
+        # 1. Primary accession number
+        # 2. Sequence version number
+        # 3. Topology: 'circular' or 'linear'
+        # 4. Molecule type
+        # 5. Data class
+        # 6. Taxonomic division
+        # 7. Sequence length
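+        #
+        # e.g. (illustrative; the data class, field 5, is left empty here):
+        # ID   X56734; SV 1; linear; mRNA; ; PLN; 1859 BP.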
+        self._write_single_line(
+            "ID",
+            "%s; %s; %s; %s; ; %s; %i %s."
+            % (accession, version, topology, mol_type, division, len(record), units),
+        )
+        handle.write("XX\n")
+        self._write_single_line("AC", accession + ";")
+        handle.write("XX\n")
+
+    @staticmethod
+    def _get_data_division(record):
+        try:
+            division = record.annotations["data_file_division"]
+        except KeyError:
+            division = "UNC"
+        if division in [
+            "PHG",
+            "ENV",
+            "FUN",
+            "HUM",
+            "INV",
+            "MAM",
+            "VRT",
+            "MUS",
+            "PLN",
+            "PRO",
+            "ROD",
+            "SYN",
+            "TGN",
+            "UNC",
+            "VRL",
+            "XXX",
+        ]:
+            # Good, already EMBL style
+            #    Division                 Code
+            #    -----------------        ----
+            #    Bacteriophage            PHG
+            #    Environmental Sample     ENV
+            #    Fungal                   FUN
+            #    Human                    HUM
+            #    Invertebrate             INV
+            #    Other Mammal             MAM
+            #    Other Vertebrate         VRT
+            #    Mus musculus             MUS
+            #    Plant                    PLN
+            #    Prokaryote               PRO
+            #    Other Rodent             ROD
+            #    Synthetic                SYN
+            #    Transgenic               TGN
+            #    Unclassified             UNC (i.e. unknown)
+            #    Viral                    VRL
+            #
+            # (plus XXX used for submitting data to EMBL)
+            pass
+        else:
+            # See if this is in GenBank style & can be converted.
+            # Generally a problem as the GenBank groups are wider
+            # than those of EMBL. Note that GenBank use "BCT" for
+            # both bacteria and archaea thus this maps to EMBL's
+            # "PRO" nicely.
+            gbk_to_embl = {"BCT": "PRO", "UNK": "UNC"}
+            try:
+                division = gbk_to_embl[division]
+            except KeyError:
+                division = "UNC"
+        assert len(division) == 3
+        return division
+
+    def _write_keywords(self, record):
+        # Put the keywords right after DE line.
+        # Each 'keyword' can have multiple words and spaces, but we
+        # must not split any 'keyword' between lines.
+        # TODO - Combine short keywords onto one line
+        for keyword in record.annotations["keywords"]:
+            self._write_single_line("KW", keyword)
+        self.handle.write("XX\n")
+
+    def _write_references(self, record):
+        # The order should be RN, RC, RP, RX, RG, RA, RT, RL
+        number = 0
+        for ref in record.annotations["references"]:
+            if not isinstance(ref, SeqFeature.Reference):
+                continue
+            number += 1
+            self._write_single_line("RN", "[%i]" % number)
+            # TODO - support for RC line (needed in parser too)
+            # TODO - support more complex record reference locations?
+            if ref.location and len(ref.location) == 1:
+                self._write_single_line(
+                    "RP",
+                    "%i-%i"
+                    % (ref.location[0].nofuzzy_start + 1, ref.location[0].nofuzzy_end),
+                )
+            # TODO - record any DOI or AGRICOLA identifier in the reference object?
+            if ref.pubmed_id:
+                self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id)
+            if ref.consrtm:
+                self._write_single_line("RG", "%s" % ref.consrtm)
+            if ref.authors:
+                # We store the AUTHORS data as a single string
+                self._write_multi_line("RA", ref.authors + ";")
+            if ref.title:
+                # We store the title as a single string
+                self._write_multi_line("RT", '"%s";' % ref.title)
+            if ref.journal:
+                # We store this as a single string - holds the journal name,
+                # volume, year, and page numbers of the citation
+                self._write_multi_line("RL", ref.journal)
+            self.handle.write("XX\n")
+
+    def _write_comment(self, record):
+        # This is a bit complicated due to the range of possible
+        # ways people might have done their annotation...
+        # Currently the parser uses a single string with newlines.
+        # A list of lines is also reasonable.
+        # A single (long) string is perhaps the most natural of all.
+        # This means we may need to deal with line wrapping.
+        comment = record.annotations["comment"]
+        if isinstance(comment, str):
+            lines = comment.split("\n")
+        elif isinstance(comment, (list, tuple)):
+            lines = comment
+        else:
+            raise ValueError("Could not understand comment annotation")
+        # TODO - Merge this with the GenBank comment code?
+        if not lines:
+            return
+        for line in lines:
+            self._write_multi_line("CC", line)
+        self.handle.write("XX\n")
+
+    def write_record(self, record):
+        """Write a single record to the output file."""
+        handle = self.handle
+        self._write_the_first_lines(record)
+
+        # PR line (0 or 1 lines only), project identifier
+        #
+        # Assuming can't use 2 lines, we should prefer newer GenBank
+        # DBLINK BioProject:... entries over the older GenBank DBLINK
+        # Project:... lines.
+        #
+        # In either case, seems EMBL uses just "PR    Project:..."
+        # regardless of the type of ID (old numeric only, or new
+        # with alpha prefix), e.g. for CP002497 NCBI now uses:
+        #
+        # DBLINK      BioProject: PRJNA60715
+        #             BioSample: SAMN03081426
+        #
+        # While EMBL uses:
+        #
+        # XX
+        # PR   Project:PRJNA60715;
+        # XX
+        #
+        # Sorting ensures (new) BioProject:... is before old Project:...
+        for xref in sorted(record.dbxrefs):
+            if xref.startswith("BioProject:"):
+                self._write_single_line("PR", xref[3:] + ";")
+                handle.write("XX\n")
+                break
+            if xref.startswith("Project:"):
+                self._write_single_line("PR", xref + ";")
+                handle.write("XX\n")
+                break
+
+        # TODO - DT lines (date)
+
+        descr = record.description
+        if descr == "":
+            descr = "."
+        self._write_multi_line("DE", descr)
+        handle.write("XX\n")
+
+        if "keywords" in record.annotations:
+            self._write_keywords(record)
+
+        # Should this be "source" or "organism"?
+        self._write_multi_line("OS", self._get_annotation_str(record, "organism"))
+        try:
+            # List of strings
+            taxonomy = "; ".join(record.annotations["taxonomy"]) + "."
+        except KeyError:
+            taxonomy = "."
+        self._write_multi_line("OC", taxonomy)
+        handle.write("XX\n")
+
+        if "references" in record.annotations:
+            self._write_references(record)
+
+        if "comment" in record.annotations:
+            self._write_comment(record)
+
+        handle.write(self.FEATURE_HEADER)
+        rec_length = len(record)
+        for feature in record.features:
+            self._write_feature(feature, rec_length)
+        handle.write("XX\n")
+
+        self._write_sequence(record)
+        handle.write("//\n")
+
+
+class ImgtWriter(EmblWriter):
+    """IMGT writer (EMBL format variant)."""
+
+    HEADER_WIDTH = 5
+    QUALIFIER_INDENT = 25  # Not 21 as in EMBL
+    QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2)
+    QUALIFIER_INDENT_TMP = "FT   %s                    "  # 25 if %s is empty
+    FEATURE_HEADER = "FH   Key                 Location/Qualifiers\nFH\n"
+
+
+def _genbank_convert_fasta(in_file, out_file):
+    """Fast GenBank to FASTA (PRIVATE)."""
+    # We don't need to parse the features...
+    records = GenBankScanner().parse_records(in_file, do_features=False)
+    return SeqIO.write(records, out_file, "fasta")
+
+
+def _embl_convert_fasta(in_file, out_file):
+    """Fast EMBL to FASTA (PRIVATE)."""
+    # We don't need to parse the features...
+    records = EmblScanner().parse_records(in_file, do_features=False)
+    return SeqIO.write(records, out_file, "fasta")
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/Interfaces.py b/code/lib/Bio/SeqIO/Interfaces.py
new file mode 100644
index 0000000..e10b923
--- /dev/null
+++ b/code/lib/Bio/SeqIO/Interfaces.py
@@ -0,0 +1,376 @@
+# Copyright 2006-2013 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support module (not for general use).
+
+Unless you are writing a new parser or writer for Bio.SeqIO, you should not
+use this module.  It provides base classes to try and simplify things.
+"""
+import warnings
+
+from abc import ABC
+from abc import abstractmethod
+
+from Bio import BiopythonDeprecationWarning
+from Bio import StreamModeError
+from Bio.Seq import MutableSeq
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+class SequenceIterator(ABC):
+    """Base class for building SeqRecord iterators.
+
+    You should write a parse method that returns a SeqRecord generator.  You
+    may wish to redefine the __init__ method as well.
+    """
+
+    def __init__(self, source, alphabet=None, mode="t", fmt=None):
+        """Create a SequenceIterator object.
+
+        Arguments:
+        - source - input file stream, or path to input file
+        - alphabet - no longer used, should be None
+
+        This method MAY be overridden by any subclass.
+
+        Note when subclassing:
+        - there should be a single non-optional argument, the source.
+        - you do not have to require an alphabet.
+        - you can add additional optional arguments.
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        try:
+            self.stream = open(source, "r" + mode)
+            self.should_close_stream = True
+        except TypeError:  # not a path, assume we received a stream
+            if mode == "t":
+                if source.read(0) != "":
+                    raise StreamModeError(
+                        "%s files must be opened in text mode." % fmt
+                    ) from None
+            elif mode == "b":
+                if source.read(0) != b"":
+                    raise StreamModeError(
+                        "%s files must be opened in binary mode." % fmt
+                    ) from None
+            else:
+                raise ValueError("Unknown mode '%s'" % mode) from None
+            self.stream = source
+            self.should_close_stream = False
+        try:
+            self.records = self.parse(self.stream)
+        except Exception:
+            if self.should_close_stream:
+                self.stream.close()
+            raise
+
+    def __next__(self):
+        try:
+            return next(self.records)
+        except Exception:
+            if self.should_close_stream:
+                self.stream.close()
+            raise
+
+    def __iter__(self):
+        """Iterate over the entries as a SeqRecord objects.
+
+        Example usage for Fasta files::
+
+            with open("example.fasta","r") as myFile:
+                myFastaReader = FastaIterator(myFile)
+                for record in myFastaReader:
+                    print(record.id)
+                    print(record.seq)
+
+        This method SHOULD NOT be overridden by any subclass. It should be
+        left as is, which will call the subclass implementation of __next__
+        to actually parse the file.
+        """
+        return self
+
+    @abstractmethod
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord iterator."""
+
+
+def _get_seq_string(record):
+    """Use this to catch errors like the sequence being None (PRIVATE)."""
+    if not isinstance(record, SeqRecord):
+        raise TypeError("Expected a SeqRecord object")
+    if record.seq is None:
+        raise TypeError("SeqRecord (id=%s) has None for its sequence." % record.id)
+    elif not isinstance(record.seq, (Seq, MutableSeq)):
+        raise TypeError("SeqRecord (id=%s) has an invalid sequence." % record.id)
+    return str(record.seq)
+
+
+# Function variant of the SequenceWriter method.
+def _clean(text):
+    """Use this to avoid getting newlines in the output (PRIVATE)."""
+    return text.replace("\n", " ").replace("\r", " ")
+
+
+class SequenceWriter:
+    """Base class for sequence writers. This class should be subclassed.
+
+    It is intended for sequential file formats with an (optional)
+    header, repeated records, and an (optional) footer, as well
+    as for interlaced file formats such as Clustal.
+
+    The user may call the write_file() method to write a complete
+    file containing the sequences.
+
+    Alternatively, users may call the write_header(), followed
+    by multiple calls to write_record() and/or write_records(),
+    followed finally by write_footer().
+
+    Note that write_header() cannot require any assumptions about
+    the number of records.
+    """
+
+    def __init__(self, target, mode="w"):
+        """Create the writer object."""
+        if mode == "w":
+            try:
+                target.write("")
+            except TypeError:
+                # target was opened in binary mode
+                raise StreamModeError("File must be opened in text mode.") from None
+            except AttributeError:
+                # target is a path
+                handle = open(target, mode)
+            else:
+                handle = target
+        elif mode == "wb":
+            try:
+                target.write(b"")
+            except TypeError:
+                # target was opened in text mode
+                raise StreamModeError("File must be opened in binary mode.") from None
+            except AttributeError:
+                # target is a path
+                handle = open(target, mode)
+            else:
+                handle = target
+        else:
+            raise RuntimeError("Unknown mode '%s'" % mode)
+
+        self._target = target
+        self.handle = handle
+
+    def clean(self, text):
+        """Use this to avoid getting newlines in the output."""
+        return text.replace("\n", " ").replace("\r", " ")
+
+    def write_header(self):
+        """Write the file header to the output file."""
+        pass
+        ##################################################
+        # You MUST implement this method in the subclass #
+        # if the file format defines a file header.      #
+        ##################################################
+
+    def write_footer(self):
+        """Write the file footer to the output file."""
+        pass
+        ##################################################
+        # You MUST implement this method in the subclass #
+        # if the file format defines a file footer.      #
+        ##################################################
+
+    def write_record(self, record):
+        """Write a single record to the output file.
+
+        record - a SeqRecord object
+        """
+        raise NotImplementedError("This method should be implemented")
+        ##################################################
+        # You MUST implement this method in the subclass #
+        # for sequential file formats.                   #
+        ##################################################
+
+    def write_records(self, records, maxcount=None):
+        """Write records to the output file, and return the number of records.
+
+        records - A list or iterator returning SeqRecord objects
+        maxcount - The maximum number of records allowed by the
+        file format, or None if there is no maximum.
+        """
+        count = 0
+        if maxcount is None:
+            for record in records:
+                self.write_record(record)
+                count += 1
+        else:
+            for record in records:
+                if count == maxcount:
+                    if maxcount == 1:
+                        raise ValueError("More than one sequence found")
+                    else:
+                        raise ValueError(
+                            "Number of sequences is larger than %d" % maxcount
+                        )
+                self.write_record(record)
+                count += 1
+        return count
+
+    def write_file(self, records, mincount=0, maxcount=None):
+        """Write a complete file with the records, and return the number of records.
+
+        records - A list or iterator returning SeqRecord objects
+        """
+        ##################################################
+        # You MUST implement this method in the subclass #
+        # for interlaced file formats.                   #
+        ##################################################
+        try:
+            self.write_header()
+            count = self.write_records(records, maxcount)
+            self.write_footer()
+        finally:
+            if self.handle is not self._target:
+                self.handle.close()
+        if count < mincount:
+            if mincount == 1:  # Common case
+                raise ValueError("Must have one sequence")
+            elif mincount == maxcount:
+                raise ValueError(
+                    "Number of sequences is %d (expected %d)" % (count, mincount)
+                )
+            else:
+                raise ValueError(
+                    "Number of sequences is %d (expected at least %d)"
+                    % (count, mincount)
+                )
+        return count
+
+
+class SequentialSequenceWriter(SequenceWriter):
+    """Base class for sequential sequence writers (DEPRECATED).
+
+    This class should be subclassed. It is no longer used.
+    It was intended for sequential file formats with an (optional)
+    header, repeated records, and an (optional) footer. It would
+    enforce calling the methods in the appropriate order. To update
+    code using ``SequentialSequenceWriter``, just subclass
+    ``SequenceWriter`` and drop the ``._header_written`` etc
+    checks (or reimplement them).
+
+    In this case (as with interlaced file formats), the user may
+    simply call the write_file() method and be done.
+
+    However, they may also call the write_header(), followed
+    by multiple calls to write_record() and/or write_records()
+    followed finally by write_footer().
+
+    Users must call write_header() and write_footer() even when
+    the file format concerned doesn't have a header or footer.
+    This is to try and make life as easy as possible when
+    switching the output format.
+
+    Note that write_header() cannot require any assumptions about
+    the number of records.
+    """
+
+    def __init__(self, target, mode="w"):
+        """Initialize the class."""
+        super().__init__(target, mode)
+        self._header_written = False
+        self._record_written = False
+        self._footer_written = False
+        warnings.warn(
+            "SequentialSequenceWriter has been deprecated, any class "
+            "subclassing it will need to subclass SequenceWriter instead.",
+            BiopythonDeprecationWarning,
+        )
+
+    def write_header(self):
+        """Write the file header.
+
+        If your file format defines a header, you should implement this method
+        in order to write the header before any of the records.
+
+        The default implementation checks the private attribute ._header_written
+        to ensure the header is only written once.
+        """
+        assert not self._header_written, "You have already called write_header()"
+        assert (
+            not self._record_written
+        ), "You have already called write_record() or write_records()"
+        assert not self._footer_written, "You have already called write_footer()"
+        self._header_written = True
+
+    def write_footer(self):
+        """Write the file footer.
+
+        If your file format defines a footer, you should implement this method
+        in order to write the footer after all the records.
+
+        The default implementation checks the private attribute ._footer_written
+        to ensure the footer is only written once.
+        """
+        assert self._header_written, "You must call write_header() first"
+        assert (
+            self._record_written
+        ), "You have not called write_record() or write_records() yet"
+        assert not self._footer_written, "You have already called write_footer()"
+        self._footer_written = True
+
+    def write_record(self, record):
+        """Write a single record to the output file.
+
+        record - a SeqRecord object
+
+        Once you have called write_header() you can call write_record()
+        and/or write_records() as many times as needed.  Then call
+        write_footer() and close().
+        """
+        assert self._header_written, "You must call write_header() first"
+        assert not self._footer_written, "You have already called write_footer()"
+        self._record_written = True
+        raise NotImplementedError("This object should be subclassed")
+
+    def write_records(self, records):
+        """Write multiple record to the output file.
+
+        records - A list or iterator returning SeqRecord objects
+
+        Once you have called write_header() you can call write_record()
+        and/or write_records() as many times as needed.  Then call
+        write_footer() and close().
+
+        Returns the number of records written.
+        """
+        # Default implementation:
+        assert self._header_written, "You must call write_header() first"
+        assert not self._footer_written, "You have already called write_footer()"
+        count = 0
+        for record in records:
+            self.write_record(record)
+            count += 1
+        # Mark as true, even if there were no records
+        self._record_written = True
+        return count
+
+    def write_file(self, records):
+        """Use this to write an entire file containing the given records.
+
+        records - A list or iterator returning SeqRecord objects
+
+        This method can only be called once.  Returns the number of records
+        written.
+        """
+        try:
+            self.write_header()
+            count = self.write_records(records)
+            self.write_footer()
+        finally:
+            if self.handle is not self._target:
+                self.handle.close()
+        return count
diff --git a/code/lib/Bio/SeqIO/NibIO.py b/code/lib/Bio/SeqIO/NibIO.py
new file mode 100644
index 0000000..8c6e84c
--- /dev/null
+++ b/code/lib/Bio/SeqIO/NibIO.py
@@ -0,0 +1,170 @@
+# Copyright 2019 by Michiel de Hoon.  All rights reserved.
+# Based on code contributed and copyright 2016 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the UCSC nib file format.
+
+Nib stands for nibble (4 bit) representation of nucleotide sequences.
+The two nibbles in a byte each store one nucleotide, represented numerically
+as follows:
+
+    - ``0`` - T
+    - ``1`` - C
+    - ``2`` - A
+    - ``3`` - G
+    - ``4`` - N (unknown)
+
+As the first bit in a nibble is set if the nucleotide is soft-masked, we
+additionally have:
+
+    - ``8`` - t
+    - ``9`` - c
+    - ``a`` - a
+    - ``b`` - g
+    - ``c`` - n (unknown)
+
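+For example, the sequence ``TCag`` is stored (after the header) as the two
+bytes ``0x01`` and ``0xab``, with the first nucleotide of each pair in the
+high nibble.
+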
+A nib file contains only one sequence record.
+You are expected to use this module via the Bio.SeqIO functions under
+the format name "nib":
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
+    >>> print("%i %s..." % (len(record), record.seq[:20]))
+    50 nAGAAGagccgcNGgCActt...
+
+For detailed information on the file format, please see the UCSC
+description at https://genome.ucsc.edu/FAQ/FAQformat.html.
+"""
+import binascii
+import struct
+import sys
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+class NibIterator(SequenceIterator):
+    """Parser for nib files."""
+
+    def __init__(self, source):
+        """Iterate over a nib file and yield a SeqRecord.
+
+            - source - a file-like object or a path to a file in the nib file
+              format as defined by UCSC; the file must be opened in binary mode.
+
+        Note that a nib file always contains only one sequence record.
+        The sequence of the resulting SeqRecord object should match the sequence
+        generated by Jim Kent's nibFrag utility run with the -masked option.
+
+        This function is used internally via the Bio.SeqIO functions:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
+        >>> print("%s %i" % (record.seq, len(record)))
+        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
+
+        You can also call it directly:
+
+        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
+        ...     for record in NibIterator(handle):
+        ...         print("%s %i" % (record.seq, len(record)))
+        ...
+        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
+
+        """
+        super().__init__(source, mode="b", fmt="Nib")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        word = handle.read(4)
+        if not word:
+            raise ValueError("Empty file.")
+        signature = word.hex()
+        if signature == "3a3de96b":
+            byteorder = "little"  # little-endian
+        elif signature == "6be93d3a":
+            byteorder = "big"  # big-endian
+        else:
+            raise ValueError("unexpected signature in nib header")
+        records = self.iterate(handle, byteorder)
+        return records
+
+    def iterate(self, handle, byteorder):
+        """Iterate over the records in the nib file."""
+        number = handle.read(4)
+        length = int.from_bytes(number, byteorder)
+        data = handle.read()
+        indices = binascii.hexlify(data)
+        if length % 2 == 0:
+            if len(indices) != length:
+                raise ValueError("Unexpected file size")
+        elif length % 2 == 1:
+            if len(indices) != length + 1:
+                raise ValueError("Unexpected file size")
+            indices = indices[:length]
+        if not set(indices).issubset(b"0123489abc"):
+            raise ValueError("Unexpected sequence data found in file")
+        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
+        nucleotides = indices.translate(table)
+        sequence = Seq(nucleotides)
+        record = SeqRecord(sequence)
+        yield record
+
+
+class NibWriter(SequenceWriter):
+    """Nib file writer."""
+
+    def __init__(self, target):
+        """Initialize a Nib writer object.
+
+        Arguments:
+         - target - output stream opened in binary mode, or a path to a file
+
+        """
+        super().__init__(target, mode="wb")
+
+    def write_header(self):
+        """Write the file header."""
+        super().write_header()
+        handle = self.handle
+        byteorder = sys.byteorder
+        if byteorder == "little":  # little-endian
+            signature = "3a3de96b"
+        elif byteorder == "big":  # big-endian
+            signature = "6be93d3a"
+        else:
+            raise RuntimeError("unexpected system byte order %s" % byteorder)
+        handle.write(bytes.fromhex(signature))
+
+    def write_record(self, record):
+        """Write a single record to the output file."""
+        handle = self.handle
+        sequence = record.seq
+        nucleotides = bytes(sequence)
+        length = len(sequence)
+        handle.write(struct.pack("i", length))
+        table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
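+        # Odd-length sequences get one padding "T" (nibble 0) so the hex
+        # string below has an even number of digits; the reader uses the
+        # stored length to drop the padding again.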
+        padding = length % 2
+        suffix = padding * b"T"
+        nucleotides += suffix
+        if not set(nucleotides).issubset(b"ACGTNacgtn"):
+            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
+        indices = nucleotides.translate(table)
+        handle.write(binascii.unhexlify(indices))
+
+    def write_file(self, records):
+        """Write the complete file with the records, and return the number of records."""
+        count = super().write_file(records, mincount=1, maxcount=1)
+        return count
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/PdbIO.py b/code/lib/Bio/SeqIO/PdbIO.py
new file mode 100644
index 0000000..a5f0bf2
--- /dev/null
+++ b/code/lib/Bio/SeqIO/PdbIO.py
@@ -0,0 +1,515 @@
+# Copyright 2012 by Eric Talevich.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for accessing sequences in PDB and mmCIF files."""
+import collections
+import warnings
+
+from Bio import BiopythonParserWarning
+from Bio.Data.IUPACData import protein_letters_3to1_extended as iupac_3to1_ext
+from Bio.Data.SCOPData import protein_letters_3to1 as scop_3to1
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+
+
+_aa3to1_dict = {}
+_aa3to1_dict.update(iupac_3to1_ext)
+_aa3to1_dict.update(scop_3to1)
+
+
+def _res2aacode(residue, undef_code="X"):
+    """Return the one-letter amino acid code from the residue name.
+
+    Non-amino-acid residues are returned as "X".
+    """
+    if isinstance(residue, str):
+        return _aa3to1_dict.get(residue, undef_code)
+
+    return _aa3to1_dict.get(residue.resname, undef_code)
+
+
+def AtomIterator(pdb_id, structure):
+    """Return SeqRecords from Structure objects.
+
+    Base function for sequence parsers that read structures via Bio.PDB parsers.
+
+    Once a parser from Bio.PDB has been used to load a structure into a
+    Bio.PDB.Structure.Structure object, there is no difference in how the
+    sequence parser interprets the residue sequence. The functions in this
+    module may be used by SeqIO modules wishing to parse sequences from lists
+    of residues.
+
+    Calling functions must pass a Bio.PDB.Structure.Structure object.
+
+    See Bio.SeqIO.PdbIO.PdbAtomIterator and Bio.SeqIO.PdbIO.CifAtomIterator for
+    details.
+    """
+    model = structure[0]
+    for chn_id, chain in sorted(model.child_dict.items()):
+        # HETATM mod. res. policy: remove mod if in sequence, else discard
+        residues = [
+            res
+            for res in chain.get_unpacked_list()
+            if _res2aacode(res.get_resname().upper()) != "X"
+        ]
+        if not residues:
+            continue
+        # Identify missing residues in the structure
+        # (fill the sequence with 'X' residues in these regions)
+        gaps = []
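+        # Each gap is recorded as (index of the first post-gap residue in
+        # the residues list, last residue number before the gap, first
+        # residue number after it); e.g. residue numbers 1,2,3,7,8 give a
+        # single entry (3, 3, 7), and the gap is filled with "XXX".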
+        rnumbers = [r.id[1] for r in residues]
+        for i, rnum in enumerate(rnumbers[:-1]):
+            if rnumbers[i + 1] != rnum + 1 and rnumbers[i + 1] != rnum:
+                # It's a gap!
+                gaps.append((i + 1, rnum, rnumbers[i + 1]))
+        if gaps:
+            res_out = []
+            prev_idx = 0
+            for i, pregap, postgap in gaps:
+                if postgap > pregap:
+                    gapsize = postgap - pregap - 1
+                    res_out.extend(_res2aacode(x) for x in residues[prev_idx:i])
+                    prev_idx = i
+                    res_out.append("X" * gapsize)
+                else:
+                    warnings.warn(
+                        "Ignoring out-of-order residues after a gap",
+                        BiopythonParserWarning,
+                    )
+                    # Keep the normal part, drop the out-of-order segment
+                    # (presumably modified or hetatm residues, e.g. 3BEG)
+                    res_out.extend(_res2aacode(x) for x in residues[prev_idx:i])
+                    break
+            else:
+                # Last segment
+                res_out.extend(_res2aacode(x) for x in residues[prev_idx:])
+        else:
+            # No gaps
+            res_out = [_res2aacode(x) for x in residues]
+        record_id = "%s:%s" % (pdb_id, chn_id)
+        # ENH - model number in SeqRecord id if multiple models?
+        # id = "Chain%s" % str(chain.id)
+        # if len(structure) > 1 :
+        #     id = ("Model%s|" % str(model.id)) + id
+
+        record = SeqRecord(Seq("".join(res_out)), id=record_id, description=record_id)
+        # TODO: Test PDB files with DNA and RNA too:
+        record.annotations["molecule_type"] = "protein"
+
+        record.annotations["model"] = model.id
+        record.annotations["chain"] = chain.id
+
+        record.annotations["start"] = int(rnumbers[0])
+        record.annotations["end"] = int(rnumbers[-1])
+        yield record
+
+
+class PdbSeqresIterator(SequenceIterator):
+    """Parser for PDB files."""
+
+    def __init__(self, source):
+        """Return SeqRecord objects for each chain in a PDB file.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+
+        The sequences are derived from the SEQRES lines in the
+        PDB file header, not the atoms of the 3D structure.
+
+        Specifically, the SEQRES and DBREF records are handled (SEQADV and
+        MODRES are not yet supported).
+
+        See: http://www.wwpdb.org/documentation/format23/sect3.html
+
+        This gets called internally via Bio.SeqIO for the SEQRES based interpretation
+        of the PDB file format:
+
+        >>> from Bio import SeqIO
+        >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
+        ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+        ...     print(record.dbxrefs)
+        ...
+        Record id 1A8O:A, chain A
+        ['UNP:P12497', 'UNP:POL_HV1N5']
+
+        Equivalently,
+
+        >>> with open("PDB/1A8O.pdb") as handle:
+        ...     for record in PdbSeqresIterator(handle):
+        ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+        ...         print(record.dbxrefs)
+        ...
+        Record id 1A8O:A, chain A
+        ['UNP:P12497', 'UNP:POL_HV1N5']
+
+        Note the chain is recorded in the annotations dictionary, and any PDB DBREF
+        lines are recorded in the database cross-references list.
+        """
+        super().__init__(source, mode="t", fmt="PDB")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Iterate over the records in the PDB file."""
+        chains = collections.defaultdict(list)
+        metadata = collections.defaultdict(list)
+
+        rec_name = None
+        for line in handle:
+            rec_name = line[0:6].strip()
+            if rec_name == "SEQRES":
+                # NB: We only actually need chain ID and the residues here;
+                # commented bits are placeholders from the wwPDB spec.
+                # Serial number of the SEQRES record for the current chain.
+                # Starts at 1 and increments by one each line.
+                # Reset to 1 for each chain.
+                # ser_num = int(line[8:10])
+                # Chain identifier. This may be any single legal character,
+                # including a blank which is used if there is only one chain.
+                chn_id = line[11]
+                # Number of residues in the chain (repeated on every record)
+                # num_res = int(line[13:17])
+                residues = [_res2aacode(res) for res in line[19:].split()]
+                chains[chn_id].extend(residues)
+            elif rec_name == "DBREF":
+                #  ID code of this entry (PDB ID)
+                pdb_id = line[7:11]
+                # Chain identifier.
+                chn_id = line[12]
+                # Initial sequence number of the PDB sequence segment.
+                # seq_begin = int(line[14:18])
+                # Initial insertion code of the PDB sequence segment.
+                # icode_begin = line[18]
+                # Ending sequence number of the PDB sequence segment.
+                # seq_end = int(line[20:24])
+                # Ending insertion code of the PDB sequence segment.
+                # icode_end = line[24]
+                # Sequence database name.
+                database = line[26:32].strip()
+                # Sequence database accession code.
+                db_acc = line[33:41].strip()
+                # Sequence database identification code.
+                db_id_code = line[42:54].strip()
+                # Initial sequence number of the database segment.
+                # db_seq_begin = int(line[55:60])
+                # Insertion code of initial residue of the segment, if PDB is the
+                # reference.
+                # db_icode_begin = line[60]
+                # Ending sequence number of the database segment.
+                # db_seq_end = int(line[62:67])
+                # Insertion code of the ending residue of the segment, if PDB is the
+                # reference.
+                # db_icode_end = line[67]
+                metadata[chn_id].append(
+                    {
+                        "pdb_id": pdb_id,
+                        "database": database,
+                        "db_acc": db_acc,
+                        "db_id_code": db_id_code,
+                    }
+                )
+            # ENH: 'SEQADV' 'MODRES'
+
+        if rec_name is None:
+            raise ValueError("Empty file.")
+
+        for chn_id, residues in sorted(chains.items()):
+            record = SeqRecord(Seq("".join(residues)))
+            record.annotations = {"chain": chn_id}
+            # TODO: Test PDB files with DNA and RNA too:
+            record.annotations["molecule_type"] = "protein"
+            if chn_id in metadata:
+                m = metadata[chn_id][0]
+                record.id = record.name = "%s:%s" % (m["pdb_id"], chn_id)
+                record.description = "%s:%s %s" % (
+                    m["database"],
+                    m["db_acc"],
+                    m["db_id_code"],
+                )
+                for melem in metadata[chn_id]:
+                    record.dbxrefs.extend(
+                        [
+                            "%s:%s" % (melem["database"], melem["db_acc"]),
+                            "%s:%s" % (melem["database"], melem["db_id_code"]),
+                        ]
+                    )
+            else:
+                record.id = chn_id
+            yield record
+
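+
+# Illustrative sketch (not part of the original module): SEQRES records use
+# fixed columns, so the chain ID sits at index 11 and the residue names
+# start at index 19, exactly as sliced in iterate() above. The sample line
+# below is hypothetical and truncated.
+def _demo_seqres_columns():
+    line = "SEQRES   1 A   14  GLY ILE VAL GLU GLN CYS CYS THR SER ILE"
+    assert line[11] == "A"
+    assert line[19:].split()[:3] == ["GLY", "ILE", "VAL"]
+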
+
+def PdbAtomIterator(source):
+    """Return SeqRecord objects for each chain in a PDB file.
+
+    Argument source is a file-like object or a path to a file.
+
+    The sequences are derived from the 3D structure (ATOM records), not the
+    SEQRES lines in the PDB file header.
+
+    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
+    are converted to "X" in the sequence.
+
+    In addition to information from the PDB header (which is the same for all
+    records), the following chain specific information is placed in the
+    annotation:
+
+    record.annotations["residues"] = List of residue ID strings
+    record.annotations["chain"] = Chain ID (typically A, B ,...)
+    record.annotations["model"] = Model ID (typically zero)
+
+    Where amino acids are missing from the structure, as indicated by residue
+    numbering, the sequence is filled in with 'X' characters to match the size
+    of the missing region, and None is included as the corresponding entry in
+    the list record.annotations["residues"].
+
+    This function uses the Bio.PDB module to do most of the hard work. The
+    annotation information could be improved but this extra parsing should be
+    done in parse_pdb_header, not this module.
+
+    This gets called internally via Bio.SeqIO for the atom based interpretation
+    of the PDB file format:
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
+    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...
+    Record id 1A8O:A, chain A
+
+    Equivalently,
+
+    >>> with open("PDB/1A8O.pdb") as handle:
+    ...     for record in PdbAtomIterator(handle):
+    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...
+    Record id 1A8O:A, chain A
+
+    """
+    # TODO - Add record.annotations to the doctest, esp the residues (not working?)
+
+    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
+    from Bio.PDB import PDBParser
+
+    structure = PDBParser().get_structure(None, source)
+    pdb_id = structure.header["idcode"]
+    if not pdb_id:
+        warnings.warn(
+            "'HEADER' line not found; can't determine PDB ID.", BiopythonParserWarning
+        )
+        pdb_id = "????"
+
+    for record in AtomIterator(pdb_id, structure):
+        # The PDB header was loaded as a dictionary, so let's reuse it all
+        record.annotations.update(structure.header)
+
+        # ENH - add letter annotations -- per-residue info, e.g. numbers
+
+        yield record
+
+
+PDBX_POLY_SEQ_SCHEME_FIELDS = (
+    "_pdbx_poly_seq_scheme.asym_id",  # Chain ID
+    "_pdbx_poly_seq_scheme.mon_id",  # Residue type
+)
+
+STRUCT_REF_FIELDS = (
+    "_struct_ref.id",  # ID of this reference
+    "_struct_ref.db_name",  # Name of the database
+    "_struct_ref.db_code",  # Code for this entity
+    "_struct_ref.pdbx_db_accession",  # DB accession ID of ref
+)
+
+STRUCT_REF_SEQ_FIELDS = (
+    "_struct_ref_seq.ref_id",  # Pointer to _struct_ref
+    "_struct_ref_seq.pdbx_PDB_id_code",  # PDB ID of this structure
+    "_struct_ref_seq.pdbx_strand_id",  # Chain ID of the reference
+)
+
+
+def CifSeqresIterator(source):
+    """Return SeqRecord objects for each chain in an mmCIF file.
+
+    Argument source is a file-like object or a path to a file.
+
+    The sequences are derived from the _pdbx_poly_seq_scheme entries in the
+    mmCIF file, not the atoms of the 3D structure.
+
+    Specifically, these mmCIF records are handled: _pdbx_poly_seq_scheme and
+    _struct_ref_seq. The _pdbx_poly_seq_scheme records contain sequence
+    information, and the _struct_ref_seq records contain database
+    cross-references.
+
+    See:
+    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
+    and
+    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html
+
+    This gets called internally via Bio.SeqIO for the sequence-based
+    interpretation of the mmCIF file format:
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
+    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...     print(record.dbxrefs)
+    ...
+    Record id 1A8O:A, chain A
+    ['UNP:P12497', 'UNP:POL_HV1N5']
+
+    Equivalently,
+
+    >>> with open("PDB/1A8O.cif") as handle:
+    ...     for record in CifSeqresIterator(handle):
+    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...         print(record.dbxrefs)
+    ...
+    Record id 1A8O:A, chain A
+    ['UNP:P12497', 'UNP:POL_HV1N5']
+
+    Note the chain is recorded in the annotations dictionary, and any mmCIF
+    _struct_ref_seq entries are recorded in the database cross-references list.
+    """
+    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
+    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
+
+    chains = collections.defaultdict(list)
+    metadata = collections.defaultdict(list)
+    records = MMCIF2Dict(source)
+
+    # Explicitly convert records to list (See #1533).
+    # If an item is not present, use an empty list
+    for field in (
+        PDBX_POLY_SEQ_SCHEME_FIELDS + STRUCT_REF_SEQ_FIELDS + STRUCT_REF_FIELDS
+    ):
+        if field not in records:
+            records[field] = []
+        elif not isinstance(records[field], list):
+            records[field] = [records[field]]
+
+    for asym_id, mon_id in zip(
+        records["_pdbx_poly_seq_scheme.asym_id"],
+        records["_pdbx_poly_seq_scheme.mon_id"],
+    ):
+        mon_id_1l = _res2aacode(mon_id)
+        chains[asym_id].append(mon_id_1l)
+
+    # Build a dict of _struct_ref records, indexed by the id field:
+    struct_refs = {}
+    for ref_id, db_name, db_code, db_acc in zip(
+        records["_struct_ref.id"],
+        records["_struct_ref.db_name"],
+        records["_struct_ref.db_code"],
+        records["_struct_ref.pdbx_db_accession"],
+    ):
+        struct_refs[ref_id] = {
+            "database": db_name,
+            "db_id_code": db_code,
+            "db_acc": db_acc,
+        }
+
+    # Look through _struct_ref_seq records, look up the corresponding
+    # _struct_ref and add an entry to the metadata list for this chain.
+    for ref_id, pdb_id, chain_id in zip(
+        records["_struct_ref_seq.ref_id"],
+        records["_struct_ref_seq.pdbx_PDB_id_code"],
+        records["_struct_ref_seq.pdbx_strand_id"],
+    ):
+        struct_ref = struct_refs[ref_id]
+
+        # The names here mirror those in PdbIO
+        metadata[chain_id].append({"pdb_id": pdb_id})
+        metadata[chain_id][-1].update(struct_ref)
+
+    for chn_id, residues in sorted(chains.items()):
+        record = SeqRecord(Seq("".join(residues)))
+        record.annotations = {"chain": chn_id}
+        # TODO: Test PDB files with DNA and RNA too:
+        record.annotations["molecule_type"] = "protein"
+        if chn_id in metadata:
+            m = metadata[chn_id][0]
+            record.id = record.name = "%s:%s" % (m["pdb_id"], chn_id)
+            record.description = "%s:%s %s" % (
+                m["database"],
+                m["db_acc"],
+                m["db_id_code"],
+            )
+            for melem in metadata[chn_id]:
+                record.dbxrefs.extend(
+                    [
+                        "%s:%s" % (melem["database"], melem["db_acc"]),
+                        "%s:%s" % (melem["database"], melem["db_id_code"]),
+                    ]
+                )
+        else:
+            record.id = chn_id
+        yield record
+
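+
+# Illustrative sketch (not part of the original module): MMCIF2Dict stores a
+# looped category as a list but a single-row category as a bare string, which
+# is why the loop above wraps scalars in lists (see #1533). Hypothetical data:
+def _demo_mmcif_normalisation():
+    records = {"_struct_ref.id": "1"}  # single-row value, not a list
+    for field in ("_struct_ref.id", "_struct_ref.db_name"):
+        if field not in records:
+            records[field] = []
+        elif not isinstance(records[field], list):
+            records[field] = [records[field]]
+    assert records == {"_struct_ref.id": ["1"], "_struct_ref.db_name": []}
+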
+
+def CifAtomIterator(source):
+    """Return SeqRecord objects for each chain in an mmCIF file.
+
+    Argument source is a file-like object or a path to a file.
+
+    The sequences are derived from the 3D structure (_atom_site.* fields)
+    in the mmCIF file.
+
+    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
+    are converted to "X" in the sequence.
+
+    In addition to information from the PDB header (which is the same for all
+    records), the following chain specific information is placed in the
+    annotation:
+
+    record.annotations["residues"] = List of residue ID strings
+    record.annotations["chain"] = Chain ID (typically A, B ,...)
+    record.annotations["model"] = Model ID (typically zero)
+
+    Where amino acids are missing from the structure, as indicated by residue
+    numbering, the sequence is filled in with 'X' characters to match the size
+    of the missing region, and None is included as the corresponding entry in
+    the list record.annotations["residues"].
+
+    This function uses the Bio.PDB module to do most of the hard work. The
+    annotation information could be improved but this extra parsing should be
+    done in parse_pdb_header, not this module.
+
+    This gets called internally via Bio.SeqIO for the atom based interpretation
+    of the PDB file format:
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-atom"):
+    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...
+    Record id 1A8O:A, chain A
+
+    Equivalently,
+
+    >>> with open("PDB/1A8O.cif") as handle:
+    ...     for record in CifAtomIterator(handle):
+    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+    ...
+    Record id 1A8O:A, chain A
+
+    """
+    # TODO - Add record.annotations to the doctest, esp the residues (not working?)
+
+    # Only import parser when needed, to avoid/delay NumPy dependency in SeqIO
+    from Bio.PDB.MMCIFParser import MMCIFParser
+
+    structure = MMCIFParser().get_structure(None, source)
+    pdb_id = structure.header["idcode"]
+    if not pdb_id:
+        warnings.warn("Could not determine the PDB ID.", BiopythonParserWarning)
+        pdb_id = "????"
+    yield from AtomIterator(pdb_id, structure)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/PhdIO.py b/code/lib/Bio/SeqIO/PhdIO.py
new file mode 100644
index 0000000..4e97704
--- /dev/null
+++ b/code/lib/Bio/SeqIO/PhdIO.py
@@ -0,0 +1,158 @@
+# Copyright 2008-2016 by Peter Cock.  All rights reserved.
+# Revisions copyright 2009 by Cymon J. Cox.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "phd" file format.
+
+PHD files are output by PHRED and used by PHRAP and CONSED.
+
+You are expected to use this module via the Bio.SeqIO functions, under the
+format name "phd". See also the underlying Bio.Sequencing.Phd module.
+
+For example, using Bio.SeqIO we can read in one of the example PHRED files
+from the Biopython unit tests:
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("Phd/phd1", "phd"):
+    ...     print(record.id)
+    ...     print("%s..." % record.seq[:10])
+    ...     print("%s..." % record.letter_annotations["phred_quality"][:10])
+    34_222_(80-A03-19).b.ab1
+    ctccgtcgga...
+    [9, 9, 10, 19, 22, 37, 28, 28, 24, 22]...
+    425_103_(81-A03-19).g.ab1
+    cgggatccca...
+    [14, 17, 22, 10, 10, 10, 15, 8, 8, 9]...
+    425_7_(71-A03-19).b.ab1
+    acataaatca...
+    [10, 10, 10, 10, 8, 8, 6, 6, 6, 6]...
+
+Since PHRED files contain quality scores, you can save them as FASTQ or as
+QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
+method of the SeqRecord object:
+
+    >>> print(record[:50].format("fastq"))
+    @425_7_(71-A03-19).b.ab1
+    acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
+    +
+    ++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
+    
+
+Or,
+
+    >>> print(record[:50].format("qual"))
+    >425_7_(71-A03-19).b.ab1
+    10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
+    10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
+    
+
+Note these examples only show the first 50 bases to keep the output short.
+"""
+from Bio.SeqIO import QualityIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Sequencing import Phd
+
+from .Interfaces import SequenceWriter
+
+
+def PhdIterator(source):
+    """Return SeqRecord objects from a PHD file.
+
+    Arguments:
+     - source - input stream opened in text mode, or a path to a file
+
+    This uses the Bio.Sequencing.Phd module to do the hard work.
+    """
+    phd_records = Phd.parse(source)
+    for phd_record in phd_records:
+        # Convert the PHY record into a SeqRecord...
+        # The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
+        # from unit test example file phd_solexa.
+        # This will cause problems if used as the record identifier
+        # (e.g. output for FASTQ format).
+        name = phd_record.file_name.split(None, 1)[0]
+        seq_record = SeqRecord(
+            phd_record.seq, id=name, name=name, description=phd_record.file_name
+        )
+        # Just re-use the comments dictionary as the SeqRecord's annotations
+        seq_record.annotations = phd_record.comments
+        seq_record.annotations["molecule_type"] = "DNA"
+        # And store the qualities and peak locations as per-letter-annotation
+        seq_record.letter_annotations["phred_quality"] = [
+            int(site[1]) for site in phd_record.sites
+        ]
+        try:
+            seq_record.letter_annotations["peak_location"] = [
+                int(site[2]) for site in phd_record.sites
+            ]
+        except IndexError:
+            # peak locations are not always there according to
+            # David Gordon (the Consed author)
+            pass
+        yield seq_record
+    # All done
+
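+
+# Illustrative sketch (not part of the original module): each PHD site is a
+# (base, quality, peak) tuple of strings, but the peak column is optional,
+# hence the IndexError fallback above. The tuples below are made up.
+def _demo_phd_sites():
+    sites = [("a", "9", "6"), ("c", "10", "12")]
+    assert [int(site[1]) for site in sites] == [9, 10]
+    try:
+        peaks = [int(site[2]) for site in sites]
+    except IndexError:
+        peaks = None  # no peak locations in this file
+    assert peaks == [6, 12]
+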
+
+class PhdWriter(SequenceWriter):
+    """Class to write Phd format files."""
+
+    def __init__(self, handle):
+        """Initialize the class."""
+        super().__init__(handle)
+
+    def write_record(self, record):
+        """Write a single Phd record to the file."""
+        assert record.seq, "No sequence present in SeqRecord"
+        # This method returns the 'phred_quality' scores or converted
+        # 'solexa_quality' scores if present, else raises a value error
+        phred_qualities = QualityIO._get_phred_quality(record)
+        peak_locations = record.letter_annotations.get("peak_location")
+        if len(record.seq) != len(phred_qualities):
+            raise ValueError(
+                "Number of phd quality scores does not match length of sequence"
+            )
+        if peak_locations:
+            if len(record.seq) != len(peak_locations):
+                raise ValueError(
+                    "Number of peak location scores does not "
+                    "match length of sequence"
+                )
+        if None in phred_qualities:
+            raise ValueError("A quality value of None was found")
+        if record.description.startswith("%s " % record.id):
+            title = record.description
+        else:
+            title = "%s %s" % (record.id, record.description)
+        self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n" % self.clean(title))
+        for annot in [k.lower() for k in Phd.CKEYWORDS]:
+            value = None
+            if annot == "trim":
+                if record.annotations.get("trim"):
+                    value = "%s %s %.4f" % record.annotations["trim"]
+            elif annot == "trace_peak_area_ratio":
+                if record.annotations.get("trace_peak_area_ratio"):
+                    value = "%.4f" % record.annotations["trace_peak_area_ratio"]
+            else:
+                value = record.annotations.get(annot)
+            if value or value == 0:
+                self.handle.write("%s: %s\n" % (annot.upper(), value))
+
+        self.handle.write("END_COMMENT\nBEGIN_DNA\n")
+        for i, site in enumerate(record.seq):
+            if peak_locations:
+                self.handle.write(
+                    "%s %i %i\n" % (site, round(phred_qualities[i]), peak_locations[i])
+                )
+            else:
+                self.handle.write("%s %i\n" % (site, round(phred_qualities[i])))
+
+        self.handle.write("END_DNA\nEND_SEQUENCE\n")
+
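+
+# Illustrative usage sketch (an assumption, not part of the original module):
+# a SeqRecord with per-letter "phred_quality" scores should be enough, since
+# write_record above simply skips absent comment annotations.
+def _demo_phd_write():
+    from io import StringIO
+    from Bio.Seq import Seq
+    handle = StringIO()
+    record = SeqRecord(Seq("ACGT"), id="demo", description="demo read")
+    record.letter_annotations["phred_quality"] = [30, 20, 10, 5]
+    PhdWriter(handle).write_record(record)
+    assert handle.getvalue().startswith("BEGIN_SEQUENCE demo")
+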
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqIO/PirIO.py b/code/lib/Bio/SeqIO/PirIO.py
new file mode 100644
index 0000000..7f3ae07
--- /dev/null
+++ b/code/lib/Bio/SeqIO/PirIO.py
@@ -0,0 +1,292 @@
+# Copyright 2008-2015 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
+
+This module is for reading and writing PIR or NBRF format files as
+SeqRecord objects.
+
+You are expected to use this module via the Bio.SeqIO functions, or if
+the file contains a sequence alignment, optionally via Bio.AlignIO instead.
+
+This format was introduced for the Protein Information Resource (PIR), a
+project of the National Biomedical Research Foundation (NBRF).  The PIR
+database itself is now part of UniProt.
+
+The file format is described online at:
+http://www.ebi.ac.uk/help/pir_frame.html
+http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
+
+An example file in this format would be::
+
+  >P1;CRAB_ANAPL
+  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+    MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
+    SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
+    GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
+    SDVPERSIPI TREEKPAIAG AQRK*
+
+  >P1;CRAB_BOVIN
+  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+    MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
+    PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
+    HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
+    QASGPERTIP ITREEKPAVT AAPKK*
+
+Or, an example of a multiple sequence alignment::
+
+  >P1;S27231
+  rhodopsin - northern leopard frog
+  MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
+  VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
+  GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
+  EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
+  ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
+  YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
+
+  >P1;I51200
+  rhodopsin - African clawed frog
+  MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
+  VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
+  GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
+  EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
+  LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
+  YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
+
+  >P1;JN0120
+  rhodopsin - Japanese lamprey
+  MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
+  VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
+  GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
+  EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
+  ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
+  YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
+
+
+As with the FASTA format, each record starts with a line beginning with ">"
+character.  There is then a two letter sequence type (e.g. P1, F1, DL, DC, RL,
+RC, or XX), a semicolon, and the identification code.  The second line is a
+free text description.  The remaining lines contain the sequence itself,
+terminating in an asterisk.  Space separated blocks of ten letters as shown
+above are typical.
+
+Sequence codes and their meanings:
+ - P1 - Protein (complete)
+ - F1 - Protein (fragment)
+ - D1 - DNA (e.g. EMBOSS seqret output)
+ - DL - DNA (linear)
+ - DC - DNA (circular)
+ - RL - RNA (linear)
+ - RC - RNA (circular)
+ - N3 - tRNA
+ - N1 - Other functional RNA
+ - XX - Unknown
+
+"""
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import _get_seq_string
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+_pir_mol_type = {
+    "P1": "protein",
+    "F1": "protein",
+    "D1": "DNA",
+    "DL": "DNA",
+    "DC": "DNA",
+    "RL": "RNA",
+    "RC": "RNA",
+    "N3": "RNA",
+    "XX": None,
+}
+
+
+class PirIterator(SequenceIterator):
+    """Parser for PIR files."""
+
+    def __init__(self, source):
+        """Iterate over a PIR file and yield SeqRecord objects.
+
+        source - file-like object or a path to a file.
+
+        Examples
+        --------
+        >>> with open("NBRF/DMB_prot.pir") as handle:
+        ...    for record in PirIterator(handle):
+        ...        print("%s length %i" % (record.id, len(record)))
+        HLA:HLA00489 length 263
+        HLA:HLA00490 length 94
+        HLA:HLA00491 length 94
+        HLA:HLA00492 length 80
+        HLA:HLA00493 length 175
+        HLA:HLA01083 length 188
+
+        """
+        super().__init__(source, mode="t", fmt="Pir")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Iterate over the records in the PIR file."""
+        # Skip any text before the first record (e.g. blank lines, comments)
+        for line in handle:
+            if line[0] == ">":
+                break
+        else:
+            return  # Premature end of file, or just empty?
+
+        while True:
+            pir_type = line[1:3]
+            if pir_type not in _pir_mol_type or line[3] != ";":
+                raise ValueError(
+                    "Records should start with '>XX;' where XX is a valid sequence type"
+                )
+            identifier = line[4:].strip()
+            description = handle.readline().strip()
+
+            lines = []
+            for line in handle:
+                if line[0] == ">":
+                    break
+                # Remove trailing whitespace, and any internal spaces
+                lines.append(line.rstrip().replace(" ", ""))
+            else:
+                line = None
+            seq = "".join(lines)
+            # Note: endswith also copes with an empty sequence (no IndexError)
+            if not seq.endswith("*"):
+                # Note the * terminator is present on nucleotide sequences too,
+                # it is not a stop codon!
+                raise ValueError(
+                    "Sequences in PIR files should include a * terminator!"
+                )
+
+            # Return the record and then continue...
+            record = SeqRecord(
+                Seq(seq[:-1]), id=identifier, name=identifier, description=description,
+            )
+            record.annotations["PIR-type"] = pir_type
+            if _pir_mol_type[pir_type]:
+                record.annotations["molecule_type"] = _pir_mol_type[pir_type]
+            yield record
+
+            if line is None:
+                return  # StopIteration
+        raise ValueError("Unrecognised PIR record format.")
+
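+
+# Illustrative usage sketch (not part of the original module): PirIterator
+# accepts any text handle, so a hypothetical in-memory record will do.
+def _demo_pir_parse():
+    from io import StringIO
+    handle = StringIO(">P1;DEMO_ID\nA made up protein.\nMDIT IHNP LI*\n")
+    records = list(PirIterator(handle))
+    assert len(records) == 1
+    assert records[0].id == "DEMO_ID"
+    assert str(records[0].seq) == "MDITIHNPLI"  # spaces and "*" stripped
+    assert records[0].annotations["PIR-type"] == "P1"
+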
+
+class PirWriter(SequenceWriter):
+    """Class to write PIR format files."""
+
+    def __init__(self, handle, wrap=60, record2title=None, code=None):
+        """Create a PIR writer.
+
+        Arguments:
+         - handle - Handle to an output file, e.g. as returned
+           by open(filename, "w")
+         - wrap - Optional line length used to wrap sequence lines.
+           Defaults to wrapping the sequence at 60 characters
+           Use zero (or None) for no wrapping, giving a single
+           long line for the sequence.
+         - record2title - Optional function to return the text to be
+           used for the title line of each record.  By default
+           a combination of the record.id, record.name and
+           record.description is used.
+         - code - Optional sequence code must be one of P1, F1,
+           D1, DL, DC, RL, RC, N3 and XX. By default None is used,
+           which means auto detection based on the molecule type
+           in the record annotation.
+
+        You can either use::
+
+            handle = open(filename, "w")
+            writer = PirWriter(handle)
+            writer.write_file(myRecords)
+            handle.close()
+
+        Or, follow the sequential file writer system, for example::
+
+            handle = open(filename, "w")
+            writer = PirWriter(handle)
+            writer.write_header() # does nothing for PIR files
+            ...
+            Multiple writer.write_record() and/or writer.write_records() calls
+            ...
+            writer.write_footer() # does nothing for PIR files
+            handle.close()
+
+        """
+        super().__init__(handle)
+        if wrap and wrap < 1:
+            raise ValueError("wrap should be None, 0, or a positive integer")
+        self.wrap = wrap
+        self.record2title = record2title
+        self.code = code
+
+    def write_record(self, record):
+        """Write a single PIR record to the file."""
+        if self.record2title:
+            title = self.clean(self.record2title(record))
+        else:
+            title = self.clean(record.id)
+
+        if record.name and record.description:
+            description = self.clean(record.name + " - " + record.description)
+        elif record.name and not record.description:
+            description = self.clean(record.name)
+        else:
+            description = self.clean(record.description)
+
+        if self.code:
+            code = self.code
+        else:
+            molecule_type = record.annotations.get("molecule_type")
+            if molecule_type is None:
+                code = "XX"
+            elif "DNA" in molecule_type:
+                code = "D1"
+            elif "RNA" in molecule_type:
+                code = "RL"
+            elif "protein" in molecule_type:
+                code = "P1"
+            else:
+                code = "XX"
+
+        if code not in _pir_mol_type:
+            raise TypeError(
+                "Sequence code must be one of %s." % ", ".join(_pir_mol_type)
+            )
+        assert "\n" not in title
+        assert "\r" not in description
+
+        self.handle.write(">%s;%s\n%s\n" % (code, title, description))
+
+        data = _get_seq_string(record)  # Catches sequence being None
+
+        assert "\n" not in data
+        assert "\r" not in data
+
+        if self.wrap:
+            line = ""
+            for i in range(0, len(data), self.wrap):
+                line += data[i : i + self.wrap] + "\n"
+            line = line[:-1] + "*\n"
+            self.handle.write(line)
+        else:
+            self.handle.write(data + "*\n")
+
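+
+# Illustrative usage sketch (not part of the original module): the
+# molecule_type annotation set below drives the automatic ">P1;" code
+# selection described in __init__. The record itself is made up.
+def _demo_pir_write():
+    from io import StringIO
+    handle = StringIO()
+    record = SeqRecord(Seq("MDITIHNPLI"), id="DEMO_ID", name="", description="demo")
+    record.annotations["molecule_type"] = "protein"
+    PirWriter(handle).write_record(record)
+    assert handle.getvalue().startswith(">P1;DEMO_ID\n")
+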
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/QualityIO.py b/code/lib/Bio/SeqIO/QualityIO.py
new file mode 100644
index 0000000..e1e8d4c
--- /dev/null
+++ b/code/lib/Bio/SeqIO/QualityIO.py
@@ -0,0 +1,2297 @@
+# Copyright 2009-2020 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the FASTQ and QUAL file formats.
+
+Note that you are expected to use this code via the Bio.SeqIO interface, as
+shown below.
+
+The FASTQ file format is used frequently at the Wellcome Trust Sanger Institute
+to bundle a FASTA sequence and its PHRED quality data (integers between 0 and
+90).  Rather than using a single FASTQ file, often paired FASTA and QUAL files
+are used containing the sequence and the quality information separately.
+
+The PHRED software reads DNA sequencing trace files, calls bases, and
+assigns a non-negative quality value to each called base using a logged
+transformation of the error probability, Q = -10 log10( Pe ), for example::
+
+    Pe = 1.0,         Q =  0
+    Pe = 0.1,         Q = 10
+    Pe = 0.01,        Q = 20
+    ...
+    Pe = 0.00000001,  Q = 80
+    Pe = 0.000000001, Q = 90
+
+In typical raw sequence reads, the PHRED quality values will be from 0 to 40.
+In the QUAL format these quality values are held as space separated text in
+a FASTA like file format.  In the FASTQ format, each quality value is encoded
+with a single ASCII character using chr(Q+33), meaning zero maps to the
+character "!" and for example 80 maps to "q".  For the Sanger FASTQ standard
+the allowed range of PHRED scores is 0 to 93 inclusive. The sequences and
+quality are then stored in pairs in a FASTA like format.
+
+Unfortunately there is no official document describing the FASTQ file format,
+and worse, several related but different variants exist. For more details,
+please read this open access publication::
+
+    The Sanger FASTQ file format for sequences with quality scores, and the
+    Solexa/Illumina FASTQ variants.
+    P.J.A.Cock (Biopython), C.J.Fields (BioPerl), N.Goto (BioRuby),
+    M.L.Heuer (BioJava) and P.M. Rice (EMBOSS).
+    Nucleic Acids Research 2010 38(6):1767-1771
+    https://doi.org/10.1093/nar/gkp1137
+
+The good news is that Roche 454 sequencers can output files in the QUAL format,
+and sensibly they use PHRED style scores like Sanger.  Converting a pair of
+FASTA and QUAL files into a Sanger style FASTQ file is easy. To extract QUAL
+files from a Roche 454 SFF binary file, use the Roche off-instrument command
+line tool "sffinfo" with the -q or -qual argument.  You can extract a matching
+FASTA file using the -s or -seq argument instead.
+
+The bad news is that Solexa/Illumina did things differently - they have their
+own scoring system AND their own incompatible versions of the FASTQ format.
+Solexa/Illumina quality scores use Q = - 10 log10 ( Pe / (1-Pe) ), which can
+be negative.  PHRED scores and Solexa scores are NOT interchangeable (but a
+reasonable mapping can be achieved between them, and they are approximately
+equal for higher quality reads).
+
+Confusingly early Solexa pipelines produced a FASTQ like file but using their
+own score mapping and an ASCII offset of 64. To make things worse, for the
+Solexa/Illumina pipeline 1.3 onwards, they introduced a third variant of the
+FASTQ file format, this time using PHRED scores (which is more consistent) but
+with an ASCII offset of 64.
+
+i.e. There are at least THREE different and INCOMPATIBLE variants of the FASTQ
+file format: The original Sanger PHRED standard, and two from Solexa/Illumina.
+
+The good news is that as of CASAVA version 1.8, Illumina sequencers will
+produce FASTQ files using the standard Sanger encoding.
+
+You are expected to use this module via the Bio.SeqIO functions, with the
+following format names:
+
+    - "qual" means simple quality files using PHRED scores (e.g. from Roche 454)
+    - "fastq" means Sanger style FASTQ files using PHRED scores and an ASCII
+      offset of 33 (e.g. from the NCBI Short Read Archive and Illumina 1.8+).
+      These can potentially hold PHRED scores from 0 to 93.
+    - "fastq-sanger" is an alias for "fastq".
+    - "fastq-solexa" means old Solexa (and also very early Illumina) style FASTQ
+      files, using Solexa scores with an ASCII offset 64. These can hold Solexa
+      scores from -5 to 62.
+    - "fastq-illumina" means newer Illumina 1.3 to 1.7 style FASTQ files, using
+      PHRED scores but with an ASCII offset 64, allowing PHRED scores from 0
+      to 62.
+
+We could potentially add support for "qual-solexa" meaning QUAL files which
+contain Solexa scores, but thus far there isn't any reason to use such files.
+
+For example, consider the following short FASTQ file::
+
+    @EAS54_6_R1_2_1_413_324
+    CCCTTCTTGTCTTCAGCGTTTCTCC
+    +
+    ;;3;;;;;;;;;;;;7;;;;;;;88
+    @EAS54_6_R1_2_1_540_792
+    TTGGCAGGCCAAGGCCGATGGATCA
+    +
+    ;;;;;;;;;;;7;;;;;-;;;3;83
+    @EAS54_6_R1_2_1_443_348
+    GTTGCTTCTGGCGTGGGTGGGGGGG
+    +
+    ;;;;;;;;;;;9;7;;.7;393333
+
+This contains three reads of length 25.  From the read length these were
+probably originally from an early Solexa/Illumina sequencer but this file
+follows the Sanger FASTQ convention (PHRED style qualities with an ASCII
+offset of 33).  This means we can parse this file using Bio.SeqIO using
+"fastq" as the format name:
+
+>>> from Bio import SeqIO
+>>> for record in SeqIO.parse("Quality/example.fastq", "fastq"):
+...     print("%s %s" % (record.id, record.seq))
+EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+The qualities are held as a list of integers in each record's annotation:
+
+>>> print(record)
+ID: EAS54_6_R1_2_1_443_348
+Name: EAS54_6_R1_2_1_443_348
+Description: EAS54_6_R1_2_1_443_348
+Number of features: 0
+Per letter annotation for: phred_quality
+Seq('GTTGCTTCTGGCGTGGGTGGGGGGG')
+>>> print(record.letter_annotations["phred_quality"])
+[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+You can use the SeqRecord format method to show this in the QUAL format:
+
+>>> print(record.format("qual"))
+>EAS54_6_R1_2_1_443_348
+26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+24 18 18 18 18
+
+
+Or go back to the FASTQ format, use "fastq" (or "fastq-sanger"):
+
+>>> print(record.format("fastq"))
+@EAS54_6_R1_2_1_443_348
+GTTGCTTCTGGCGTGGGTGGGGGGG
++
+;;;;;;;;;;;9;7;;.7;393333
+
+
+Or, using the Illumina 1.3+ FASTQ encoding (PHRED values with an ASCII offset
+of 64):
+
+>>> print(record.format("fastq-illumina"))
+@EAS54_6_R1_2_1_443_348
+GTTGCTTCTGGCGTGGGTGGGGGGG
++
+ZZZZZZZZZZZXZVZZMVZRXRRRR
+
+
+You can also get Biopython to convert the scores and show a Solexa style
+FASTQ file:
+
+>>> print(record.format("fastq-solexa"))
+@EAS54_6_R1_2_1_443_348
+GTTGCTTCTGGCGTGGGTGGGGGGG
++
+ZZZZZZZZZZZXZVZZMVZRXRRRR
+
+
+Notice that this is actually the same output as above using "fastq-illumina"
+as the format! The reason for this is all these scores are high enough that
+the PHRED and Solexa scores are almost equal. The differences become apparent
+for poor quality reads. See the functions solexa_quality_from_phred and
+phred_quality_from_solexa for more details.
+
+If you wanted to trim your sequences (perhaps to remove low quality regions,
+or to remove a primer sequence), try slicing the SeqRecord objects.  e.g.
+
+>>> sub_rec = record[5:15]
+>>> print(sub_rec)
+ID: EAS54_6_R1_2_1_443_348
+Name: EAS54_6_R1_2_1_443_348
+Description: EAS54_6_R1_2_1_443_348
+Number of features: 0
+Per letter annotation for: phred_quality
+Seq('TTCTGGCGTG')
+>>> print(sub_rec.letter_annotations["phred_quality"])
+[26, 26, 26, 26, 26, 26, 24, 26, 22, 26]
+>>> print(sub_rec.format("fastq"))
+@EAS54_6_R1_2_1_443_348
+TTCTGGCGTG
++
+;;;;;;9;7;
+
+
+If you wanted to, you could read in this FASTQ file, and save it as a QUAL file:
+
+>>> from Bio import SeqIO
+>>> record_iterator = SeqIO.parse("Quality/example.fastq", "fastq")
+>>> with open("Quality/temp.qual", "w") as out_handle:
+...     SeqIO.write(record_iterator, out_handle, "qual")
+3
+
+You can of course read in a QUAL file, such as the one we just created:
+
+>>> from Bio import SeqIO
+>>> for record in SeqIO.parse("Quality/temp.qual", "qual"):
+...     print("%s read of length %d" % (record.id, len(record.seq)))
+EAS54_6_R1_2_1_413_324 read of length 25
+EAS54_6_R1_2_1_540_792 read of length 25
+EAS54_6_R1_2_1_443_348 read of length 25
+
+Notice that QUAL files don't have a proper sequence present!  But the quality
+information is there:
+
+>>> print(record)
+ID: EAS54_6_R1_2_1_443_348
+Name: EAS54_6_R1_2_1_443_348
+Description: EAS54_6_R1_2_1_443_348
+Number of features: 0
+Per letter annotation for: phred_quality
+Undefined sequence of length 25
+>>> print(record.letter_annotations["phred_quality"])
+[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+Just to keep things tidy, if you are following this example yourself, you can
+delete this temporary file now:
+
+>>> import os
+>>> os.remove("Quality/temp.qual")
+
+Sometimes you won't have a FASTQ file, but rather just a pair of FASTA and QUAL
+files.  Because the Bio.SeqIO system is designed for reading single files, you
+would have to read the two in separately and then combine the data.  However,
+since this is such a common thing to want to do, there is a helper iterator
+defined in this module that does this for you - PairedFastaQualIterator.
+
+Alternatively, if you have enough RAM to hold all the records in memory at once,
+then a simple dictionary approach would work:
+
+>>> from Bio import SeqIO
+>>> reads = SeqIO.to_dict(SeqIO.parse("Quality/example.fasta", "fasta"))
+>>> for rec in SeqIO.parse("Quality/example.qual", "qual"):
+...     reads[rec.id].letter_annotations["phred_quality"]=rec.letter_annotations["phred_quality"]
+
+You can then access any record by its key, and get both the sequence and the
+quality scores.
+
+>>> print(reads["EAS54_6_R1_2_1_540_792"].format("fastq"))
+@EAS54_6_R1_2_1_540_792
+TTGGCAGGCCAAGGCCGATGGATCA
++
+;;;;;;;;;;;7;;;;;-;;;3;83
+
+
+It is important that you explicitly tell Bio.SeqIO which FASTQ variant you are
+using ("fastq" or "fastq-sanger" for the Sanger standard using PHRED values,
+"fastq-solexa" for the original Solexa/Illumina variant, or "fastq-illumina"
+for the more recent variant), as this cannot be detected reliably
+automatically.
+
+To illustrate this problem, let's consider an artificial example:
+
+>>> from Bio.Seq import Seq
+>>> from Bio.SeqRecord import SeqRecord
+>>> test = SeqRecord(Seq("NACGTACGTA"), id="Test", description="Made up!")
+>>> print(test.format("fasta"))
+>Test Made up!
+NACGTACGTA
+
+>>> print(test.format("fastq"))
+Traceback (most recent call last):
+ ...
+ValueError: No suitable quality scores found in letter_annotations of SeqRecord (id=Test).
+
+We created a sample SeqRecord, and can show it in FASTA format - but for QUAL
+or FASTQ format we need to provide some quality scores. These are held as a
+list of integers (one for each base) in the letter_annotations dictionary:
+
+>>> test.letter_annotations["phred_quality"] = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40]
+>>> print(test.format("qual"))
+>Test Made up!
+0 1 2 3 4 5 10 20 30 40
+
+>>> print(test.format("fastq"))
+@Test Made up!
+NACGTACGTA
++
+!"#$%&+5?I
+
+
+We can check this FASTQ encoding - the first PHRED quality was zero, and this
+mapped to an exclamation mark, while the final score was 40 and this mapped to
+the letter "I":
+
+>>> ord('!') - 33
+0
+>>> ord('I') - 33
+40
+>>> [ord(letter)-33 for letter in '!"#$%&+5?I']
+[0, 1, 2, 3, 4, 5, 10, 20, 30, 40]
+
+Similarly, we could produce an Illumina 1.3 to 1.7 style FASTQ file using PHRED
+scores with an offset of 64:
+
+>>> print(test.format("fastq-illumina"))
+@Test Made up!
+NACGTACGTA
++
+@ABCDEJT^h
+
+
+And we can check this too - the first PHRED score was zero, and this mapped to
+"@", while the final score was 40 and this mapped to "h":
+
+>>> ord("@") - 64
+0
+>>> ord("h") - 64
+40
+>>> [ord(letter)-64 for letter in "@ABCDEJT^h"]
+[0, 1, 2, 3, 4, 5, 10, 20, 30, 40]
+
+Notice how different the standard Sanger FASTQ and the Illumina 1.3 to 1.7 style
+FASTQ files look for the same data! Then we have the older Solexa/Illumina
+format to consider which encodes Solexa scores instead of PHRED scores.
+
+First let's see what Biopython says if we convert the PHRED scores into Solexa
+scores (rounding to one decimal place):
+
+>>> for q in [0, 1, 2, 3, 4, 5, 10, 20, 30, 40]:
+...     print("PHRED %i maps to Solexa %0.1f" % (q, solexa_quality_from_phred(q)))
+PHRED 0 maps to Solexa -5.0
+PHRED 1 maps to Solexa -5.0
+PHRED 2 maps to Solexa -2.3
+PHRED 3 maps to Solexa -0.0
+PHRED 4 maps to Solexa 1.8
+PHRED 5 maps to Solexa 3.3
+PHRED 10 maps to Solexa 9.5
+PHRED 20 maps to Solexa 20.0
+PHRED 30 maps to Solexa 30.0
+PHRED 40 maps to Solexa 40.0
+
+Now here is the record using the old Solexa style FASTQ file:
+
+>>> print(test.format("fastq-solexa"))
+@Test Made up!
+NACGTACGTA
++
+;;>@BCJT^h
+
+
+Again, this is using an ASCII offset of 64, so we can check the Solexa scores:
+
+>>> [ord(letter)-64 for letter in ";;>@BCJT^h"]
+[-5, -5, -2, 0, 2, 3, 10, 20, 30, 40]
+
+This explains why the last few letters of this FASTQ output matched that using
+the Illumina 1.3 to 1.7 format - high quality PHRED scores and Solexa scores
+are approximately equal.
+
+"""
+import warnings
+
+from math import log
+
+from Bio import BiopythonParserWarning
+from Bio import BiopythonWarning
+from Bio import StreamModeError
+from Bio.File import as_handle
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import _clean
+from .Interfaces import _get_seq_string
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+# define score offsets. See discussion for differences between Sanger and
+# Solexa offsets.
+SANGER_SCORE_OFFSET = 33
+SOLEXA_SCORE_OFFSET = 64
+
+
+def solexa_quality_from_phred(phred_quality):
+    """Covert a PHRED quality (range 0 to about 90) to a Solexa quality.
+
+    PHRED and Solexa quality scores are both log transformations of a
+    probability of error (high score = low probability of error). This function
+    takes a PHRED score, transforms it back to a probability of error, and
+    then re-expresses it as a Solexa score. This assumes the error estimates
+    are equivalent.
+
+    How does this work exactly? Well the PHRED quality is minus ten times the
+    base ten logarithm of the probability of error::
+
+        phred_quality = -10*log(error,10)
+
+    Therefore, turning this round::
+
+        error = 10 ** (- phred_quality / 10)
+
+    Now, Solexa qualities use a different log transformation::
+
+        solexa_quality = -10*log(error/(1-error),10)
+
+    After substitution and a little manipulation we get::
+
+         solexa_quality = 10*log(10**(phred_quality/10.0) - 1, 10)
+
+    However, real Solexa files use a minimum quality of -5. This does have a
+    good reason - a random base call would be correct 25% of the time,
+    and thus have a probability of error of 0.75, which gives 1.25 as the PHRED
+    quality, or -4.77 as the Solexa quality. Thus (after rounding), a random
+    nucleotide read would have a PHRED quality of 1, or a Solexa quality of -5.
+
+    Taken literally, this logarithmic formula would map a PHRED quality of zero
+    to a Solexa quality of minus infinity. Of course, taken literally, a PHRED
+    score of zero means a probability of error of one (i.e. the base call is
+    definitely wrong), which is worse than random! In practice, a PHRED quality
+    of zero usually means a default value, or perhaps random - and therefore
+    mapping it to the minimum Solexa score of -5 is reasonable.
+
+    In conclusion, we follow EMBOSS, and take this logarithmic formula but also
+    apply a minimum value of -5.0 for the Solexa quality, and also map a PHRED
+    quality of zero to -5.0 as well.
+
+    Note this function will return a floating point number, it is up to you to
+    round this to the nearest integer if appropriate.  e.g.
+
+    >>> print("%0.2f" % round(solexa_quality_from_phred(80), 2))
+    80.00
+    >>> print("%0.2f" % round(solexa_quality_from_phred(50), 2))
+    50.00
+    >>> print("%0.2f" % round(solexa_quality_from_phred(20), 2))
+    19.96
+    >>> print("%0.2f" % round(solexa_quality_from_phred(10), 2))
+    9.54
+    >>> print("%0.2f" % round(solexa_quality_from_phred(5), 2))
+    3.35
+    >>> print("%0.2f" % round(solexa_quality_from_phred(4), 2))
+    1.80
+    >>> print("%0.2f" % round(solexa_quality_from_phred(3), 2))
+    -0.02
+    >>> print("%0.2f" % round(solexa_quality_from_phred(2), 2))
+    -2.33
+    >>> print("%0.2f" % round(solexa_quality_from_phred(1), 2))
+    -5.00
+    >>> print("%0.2f" % round(solexa_quality_from_phred(0), 2))
+    -5.00
+
+    Notice that for high quality reads PHRED and Solexa scores are numerically
+    equal. The differences are important for poor quality reads, where PHRED
+    has a minimum of zero but Solexa scores can be negative.
+
+    Finally, as a special case where None is used for a "missing value", None
+    is returned:
+
+    >>> print(solexa_quality_from_phred(None))
+    None
+    """
+    if phred_quality is None:
+        # Assume None is used as some kind of NULL or NA value; return None
+        # e.g. Bio.SeqIO gives Ace contig gaps a quality of None.
+        return None
+    elif phred_quality > 0:
+        # Solexa uses a minimum value of -5, which after rounding matches a
+        # random nucleotide base call.
+        return max(-5.0, 10 * log(10 ** (phred_quality / 10.0) - 1, 10))
+    elif phred_quality == 0:
+        # Special case, map to -5 as discussed in the docstring
+        return -5.0
+    else:
+        raise ValueError(
+            "PHRED qualities must be positive (or zero), not %r" % phred_quality
+        )
+
+
+def phred_quality_from_solexa(solexa_quality):
+    """Convert a Solexa quality (which can be negative) to a PHRED quality.
+
+    PHRED and Solexa quality scores are both log transformations of a
+    probability of error (high score = low probability of error). This function
+    takes a Solexa score, transforms it back to a probability of error, and
+    then re-expresses it as a PHRED score. This assumes the error estimates
+    are equivalent.
+
+    The underlying formulas are given in the documentation for the sister
+    function solexa_quality_from_phred, in this case the operation is::
+
+        phred_quality = 10*log(10**(solexa_quality/10.0) + 1, 10)
+
+    This will return a floating point number, it is up to you to round this to
+    the nearest integer if appropriate.  e.g.
+
+    >>> print("%0.2f" % round(phred_quality_from_solexa(80), 2))
+    80.00
+    >>> print("%0.2f" % round(phred_quality_from_solexa(20), 2))
+    20.04
+    >>> print("%0.2f" % round(phred_quality_from_solexa(10), 2))
+    10.41
+    >>> print("%0.2f" % round(phred_quality_from_solexa(0), 2))
+    3.01
+    >>> print("%0.2f" % round(phred_quality_from_solexa(-5), 2))
+    1.19
+
+    Note that a solexa_quality less than -5 is not expected and will trigger
+    a warning, but it will still be converted as per the logarithmic mapping
+    (giving a number between 0 and 1.19 back).
+
+    As a special case where None is used for a "missing value", None is
+    returned:
+
+    >>> print(phred_quality_from_solexa(None))
+    None
+    """
+    if solexa_quality is None:
+        # Assume None is used as some kind of NULL or NA value; return None
+        return None
+    if solexa_quality < -5:
+        warnings.warn(
+            "Solexa quality less than -5 passed, %r" % solexa_quality, BiopythonWarning
+        )
+    return 10 * log(10 ** (solexa_quality / 10.0) + 1, 10)
+
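+
+# Illustrative check (not part of the original module): the two conversions
+# above are inverses of each other, e.g. a PHRED quality of 20 round-trips:
+def _demo_quality_round_trip():
+    solexa = solexa_quality_from_phred(20)  # 10*log10(10**2 - 1), about 19.96
+    assert round(solexa, 2) == 19.96
+    assert round(phred_quality_from_solexa(solexa), 2) == 20.0
+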
+
+def _get_phred_quality(record):
+    """Extract PHRED qualities from a SeqRecord's letter_annotations (PRIVATE).
+
+    If there are no PHRED qualities, but there are Solexa qualities, those are
+    used instead after conversion.
+    """
+    try:
+        return record.letter_annotations["phred_quality"]
+    except KeyError:
+        pass
+    try:
+        return [
+            phred_quality_from_solexa(q)
+            for q in record.letter_annotations["solexa_quality"]
+        ]
+    except KeyError:
+        raise ValueError(
+            "No suitable quality scores found in "
+            "letter_annotations of SeqRecord (id=%s)." % record.id
+        ) from None
+
+
+# Only map 0 to 93, we need to give a warning on truncating at 93
+_phred_to_sanger_quality_str = {
+    qp: chr(min(126, qp + SANGER_SCORE_OFFSET)) for qp in range(0, 93 + 1)
+}
+# Only map -5 to 93, we need to give a warning on truncating at 93
+_solexa_to_sanger_quality_str = {
+    qs: chr(min(126, int(round(phred_quality_from_solexa(qs)) + SANGER_SCORE_OFFSET)))
+    for qs in range(-5, 93 + 1)
+}
+
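+
+# Illustrative check (not part of the original module): the cached table
+# implements chr(Q + 33), so PHRED 0 is "!" and the PHRED 93 cap is "~"
+# (ASCII 126, the tilde):
+def _demo_sanger_table():
+    assert _phred_to_sanger_quality_str[0] == "!"
+    assert _phred_to_sanger_quality_str[40] == "I"
+    assert _phred_to_sanger_quality_str[93] == "~"
+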
+
+def _get_sanger_quality_str(record):
+    """Return a Sanger FASTQ encoded quality string (PRIVATE).
+
+    >>> from Bio.Seq import Seq
+    >>> from Bio.SeqRecord import SeqRecord
+    >>> r = SeqRecord(Seq("ACGTAN"), id="Test",
+    ...               letter_annotations = {"phred_quality":[50, 40, 30, 20, 10, 0]})
+    >>> _get_sanger_quality_str(r)
+    'SI?5+!'
+
+    If, as in the above example (or indeed when parsing with Bio.SeqIO), the
+    PHRED qualities are integers, this function is able to use a very fast
+    pre-cached mapping. However, if they are floats which differ slightly, then
+    it has to do the appropriate rounding - which is slower:
+
+    >>> r2 = SeqRecord(Seq("ACGTAN"), id="Test2",
+    ...      letter_annotations = {"phred_quality":[50.0, 40.05, 29.99, 20, 9.55, 0.01]})
+    >>> _get_sanger_quality_str(r2)
+    'SI?5+!'
+
+    If your scores include a None value, this raises an exception:
+
+    >>> r3 = SeqRecord(Seq("ACGTAN"), id="Test3",
+    ...               letter_annotations = {"phred_quality":[50, 40, 30, 20, 10, None]})
+    >>> _get_sanger_quality_str(r3)
+    Traceback (most recent call last):
+       ...
+    TypeError: A quality value of None was found
+
+    If (strangely) your record has both PHRED and Solexa scores, then the PHRED
+    scores are used in preference:
+
+    >>> r4 = SeqRecord(Seq("ACGTAN"), id="Test4",
+    ...               letter_annotations = {"phred_quality":[50, 40, 30, 20, 10, 0],
+    ...                                     "solexa_quality":[-5, -4, 0, None, 0, 40]})
+    >>> _get_sanger_quality_str(r4)
+    'SI?5+!'
+
+    If there are no PHRED scores, but there are Solexa scores, these are used
+    instead (after the appropriate conversion):
+
+    >>> r5 = SeqRecord(Seq("ACGTAN"), id="Test5",
+    ...      letter_annotations = {"solexa_quality":[40, 30, 20, 10, 0, -5]})
+    >>> _get_sanger_quality_str(r5)
+    'I?5+$"'
+
+    Again, integer Solexa scores can be looked up in a pre-cached mapping making
+    this very fast. You can still use approximate floating point scores:
+
+    >>> r6 = SeqRecord(Seq("ACGTAN"), id="Test6",
+    ...      letter_annotations = {"solexa_quality":[40.1, 29.7, 20.01, 10, 0.0, -4.9]})
+    >>> _get_sanger_quality_str(r6)
+    'I?5+$"'
+
+    Notice that due to the limited range of printable ASCII characters, a
+    PHRED quality of 93 is the maximum that can be held in a Sanger FASTQ
+    file (using ASCII 126, the tilde). This function will issue a warning
+    in this situation.
+    """
+    # TODO - This function works and is fast, but it is also ugly
+    # and there is considerable repetition of code for the other
+    # two FASTQ variants.
+    try:
+        # These take priority (in case both Solexa and PHRED scores found)
+        qualities = record.letter_annotations["phred_quality"]
+    except KeyError:
+        # Fall back on solexa scores...
+        pass
+    else:
+        # Try and use the precomputed mapping:
+        try:
+            return "".join(_phred_to_sanger_quality_str[qp] for qp in qualities)
+        except KeyError:
+            # Could be a float, or a None in the list, or a high value.
+            pass
+        if None in qualities:
+            raise TypeError("A quality value of None was found")
+        if max(qualities) >= 93.5:
+            warnings.warn(
+                "Data loss - max PHRED quality 93 in Sanger FASTQ", BiopythonWarning
+            )
+        # This will apply the truncation at 93, giving max ASCII 126
+        return "".join(
+            chr(min(126, int(round(qp)) + SANGER_SCORE_OFFSET)) for qp in qualities
+        )
+    # Fall back on the Solexa scores...
+    try:
+        qualities = record.letter_annotations["solexa_quality"]
+    except KeyError:
+        raise ValueError(
+            "No suitable quality scores found in "
+            "letter_annotations of SeqRecord (id=%s)." % record.id
+        ) from None
+    # Try and use the precomputed mapping:
+    try:
+        return "".join(_solexa_to_sanger_quality_str[qs] for qs in qualities)
+    except KeyError:
+        # Could be a float, a None in the list, or a value outside the cached range.
+        pass
+    if None in qualities:
+        raise TypeError("A quality value of None was found")
+    # Must do this the slow way, first converting the Solexa scores into
+    # PHRED scores:
+    if max(qualities) >= 93.5:
+        warnings.warn(
+            "Data loss - max PHRED quality 93 in Sanger FASTQ", BiopythonWarning
+        )
+    # This will apply the truncation at 93, giving max ASCII 126
+    return "".join(
+        chr(min(126, int(round(phred_quality_from_solexa(qs))) + SANGER_SCORE_OFFSET))
+        for qs in qualities
+    )
+
+
+# Only map 0 to 62, we need to give a warning on truncating at 62
+assert 62 + SOLEXA_SCORE_OFFSET == 126
+_phred_to_illumina_quality_str = {
+    qp: chr(qp + SOLEXA_SCORE_OFFSET) for qp in range(0, 62 + 1)
+}
+# Only map -5 to 62, we need to give a warning on truncating at 62
+_solexa_to_illumina_quality_str = {
+    qs: chr(int(round(phred_quality_from_solexa(qs))) + SOLEXA_SCORE_OFFSET)
+    for qs in range(-5, 62 + 1)
+}
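+# For example, with SOLEXA_SCORE_OFFSET of 64 these caches map PHRED 0 to "@"
+# (ASCII 64) and PHRED 62 to "~" (ASCII 126); Solexa -5 converts to roughly
+# PHRED 1.19, which rounds to 1 and gives "A" (ASCII 65).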
+
+
+def _get_illumina_quality_str(record):
+    """Return an Illumina 1.3 to 1.7 FASTQ encoded quality string (PRIVATE).
+
+    Notice that due to the limited range of printable ASCII characters, a
+    PHRED quality of 62 is the maximum that can be held in an Illumina FASTQ
+    file (using ASCII 126, the tilde). This function will issue a warning
+    in this situation.
+    """
+    # TODO - This function works and is fast, but it is also ugly
+    # and there is considerable repetition of code for the other
+    # two FASTQ variants.
+    try:
+        # These take priority (in case both Solexa and PHRED scores found)
+        qualities = record.letter_annotations["phred_quality"]
+    except KeyError:
+        # Fall back on solexa scores...
+        pass
+    else:
+        # Try and use the precomputed mapping:
+        try:
+            return "".join(_phred_to_illumina_quality_str[qp] for qp in qualities)
+        except KeyError:
+            # Could be a float, or a None in the list, or a high value.
+            pass
+        if None in qualities:
+            raise TypeError("A quality value of None was found")
+        if max(qualities) >= 62.5:
+            warnings.warn(
+                "Data loss - max PHRED quality 62 in Illumina FASTQ", BiopythonWarning
+            )
+        # This will apply the truncation at 62, giving max ASCII 126
+        return "".join(
+            chr(min(126, int(round(qp)) + SOLEXA_SCORE_OFFSET)) for qp in qualities
+        )
+    # Fall back on the Solexa scores...
+    try:
+        qualities = record.letter_annotations["solexa_quality"]
+    except KeyError:
+        raise ValueError(
+            "No suitable quality scores found in "
+            "letter_annotations of SeqRecord (id=%s)." % record.id
+        ) from None
+    # Try and use the precomputed mapping:
+    try:
+        return "".join(_solexa_to_illumina_quality_str[qs] for qs in qualities)
+    except KeyError:
+        # Could be a float, a None in the list, or a value outside the cached range.
+        pass
+    if None in qualities:
+        raise TypeError("A quality value of None was found")
+    # Must do this the slow way, first converting the Solexa scores into
+    # PHRED scores:
+    if max(qualities) >= 62.5:
+        warnings.warn(
+            "Data loss - max PHRED quality 62 in Illumina FASTQ", BiopythonWarning
+        )
+    # This will apply the truncation at 62, giving max ASCII 126
+    return "".join(
+        chr(min(126, int(round(phred_quality_from_solexa(qs))) + SOLEXA_SCORE_OFFSET))
+        for qs in qualities
+    )
+
+
+# Only map -5 to 62, we need to give a warning on truncating at 62
+assert 62 + SOLEXA_SCORE_OFFSET == 126
+_solexa_to_solexa_quality_str = {
+    qs: chr(min(126, qs + SOLEXA_SCORE_OFFSET)) for qs in range(-5, 62 + 1)
+}
+# Only map 0 to 62, we need to give a warning on truncating at 62
+_phred_to_solexa_quality_str = {
+    qp: chr(min(126, int(round(solexa_quality_from_phred(qp))) + SOLEXA_SCORE_OFFSET))
+    for qp in range(0, 62 + 1)
+}
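+# For example, Solexa -5 maps directly to ";" (ASCII 59) and Solexa 62 to "~"
+# (ASCII 126); PHRED 0 converts to the Solexa minimum of -5 and so also gives ";".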
+
+
+def _get_solexa_quality_str(record):
+    """Return a Solexa FASTQ encoded quality string (PRIVATE).
+
+    Notice that due to the limited range of printable ASCII characters, a
+    Solexa quality of 62 is the maximum that can be held in a Solexa FASTQ
+    file (using ASCII 126, the tilde). This function will issue a warning
+    in this situation.
+    """
+    # TODO - This function works and is fast, but it is also ugly
+    # and there is considerable repetition of code for the other
+    # two FASTQ variants.
+    try:
+        # These take priority (in case both Solexa and PHRED scores found)
+        qualities = record.letter_annotations["solexa_quality"]
+    except KeyError:
+        # Fall back on PHRED scores...
+        pass
+    else:
+        # Try and use the precomputed mapping:
+        try:
+            return "".join(_solexa_to_solexa_quality_str[qs] for qs in qualities)
+        except KeyError:
+            # Could be a float, or a None in the list, or a high value.
+            pass
+        if None in qualities:
+            raise TypeError("A quality value of None was found")
+        if max(qualities) >= 62.5:
+            warnings.warn(
+                "Data loss - max Solexa quality 62 in Solexa FASTQ", BiopythonWarning
+            )
+        # This will apply the truncation at 62, giving max ASCII 126
+        return "".join(
+            chr(min(126, int(round(qs)) + SOLEXA_SCORE_OFFSET)) for qs in qualities
+        )
+    # Fall back on the PHRED scores...
+    try:
+        qualities = record.letter_annotations["phred_quality"]
+    except KeyError:
+        raise ValueError(
+            "No suitable quality scores found in "
+            "letter_annotations of SeqRecord (id=%s)." % record.id
+        ) from None
+    # Try and use the precomputed mapping:
+    try:
+        return "".join(_phred_to_solexa_quality_str[qp] for qp in qualities)
+    except KeyError:
+        # Could be a float, a None in the list, or a value too big
+        # to be in the cache.
+        pass
+    if None in qualities:
+        raise TypeError("A quality value of None was found")
+    # Must do this the slow way, first converting the PHRED scores into
+    # Solexa scores:
+    if max(qualities) >= 62.5:
+        warnings.warn(
+            "Data loss - max Solexa quality 62 in Solexa FASTQ", BiopythonWarning
+        )
+    return "".join(
+        chr(min(126, int(round(solexa_quality_from_phred(qp))) + SOLEXA_SCORE_OFFSET))
+        for qp in qualities
+    )
+
+
+# TODO - Default to nucleotide or even DNA?
+def FastqGeneralIterator(source):
+    """Iterate over Fastq records as string tuples (not as SeqRecord objects).
+
+    Arguments:
+     - source - input stream opened in text mode, or a path to a file
+
+    This code does not try to interpret the quality string numerically.  It
+    just returns tuples of the title, sequence and quality as strings.  For
+    the sequence and quality, any whitespace (such as new lines) is removed.
+
+    Our SeqRecord based FASTQ iterators call this function internally, and then
+    turn the strings into SeqRecord objects, mapping the quality string into
+    a list of numerical scores.  If you want to do a custom quality mapping,
+    then you might consider calling this function directly.
+
+    For parsing FASTQ files, the title string from the "@" line at the start
+    of each record can optionally be omitted on the "+" lines.  If it is
+    repeated, it must be identical.
+
+    The sequence string and the quality string can optionally be split over
+    multiple lines, although several sources discourage this.  In comparison,
+    for the FASTA file format, line breaks every 60 to 80 characters are
+    the norm.
+
+    **WARNING** - Because the "@" character can appear in the quality string,
+    this can cause problems as this is also the marker for the start of
+    a new sequence.  In fact, the "+" sign can appear as well.  Some sources
+    recommend having no line breaks in the quality string to avoid this, but
+    even that is not enough; consider this example::
+
+        @071113_EAS56_0053:1:1:998:236
+        TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA
+        +071113_EAS56_0053:1:1:998:236
+        IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III
+        @071113_EAS56_0053:1:1:182:712
+        ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG
+        +
+        @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+
+        @071113_EAS56_0053:1:1:153:10
+        TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT
+        +
+        IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6
+        @071113_EAS56_0053:1:3:990:501
+        TGGGAGGTTTTATGTGGA
+        AAGCAGCAATGTACAAGA
+        +
+        IIIIIII.IIIIII1@44
+        @-7.%<&+/$/%4(++(%
+
+    This is four PHRED encoded FASTQ entries originally from an NCBI source
+    (given the read length of 36, these are probably Solexa Illumina reads where
+    the quality has been mapped onto the PHRED values).
+
+    This example has been edited to illustrate some of the nasty things allowed
+    in the FASTQ format.  Firstly, on the "+" lines most but not all of the
+    (redundant) identifiers are omitted.  In real files it is likely that all or
+    none of these extra identifiers will be present.
+
+    Secondly, while the first three sequences have been shown without line
+    breaks, the last has been split over multiple lines.  In real files any line
+    breaks are likely to be consistent.
+
+    Thirdly, some of the quality string lines start with an "@" character.  For
+    the second record this is unavoidable.  However for the fourth sequence this
+    only happens because its quality string is split over two lines.  A naive
+    parser could wrongly treat any line starting with an "@" as the beginning of
+    a new sequence!  This code copes with this possible ambiguity by keeping
+    track of the length of the sequence which gives the expected length of the
+    quality string.
+
+    Using this tricky example file as input, this short bit of code demonstrates
+    what this parsing function would return:
+
+    >>> with open("Quality/tricky.fastq") as handle:
+    ...     for (title, sequence, quality) in FastqGeneralIterator(handle):
+    ...         print(title)
+    ...         print("%s %s" % (sequence, quality))
+    ...
+    071113_EAS56_0053:1:1:998:236
+    TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III
+    071113_EAS56_0053:1:1:182:712
+    ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+
+    071113_EAS56_0053:1:1:153:10
+    TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6
+    071113_EAS56_0053:1:3:990:501
+    TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA IIIIIII.IIIIII1@44@-7.%<&+/$/%4(++(%
+
+    Finally we note that some sources state that the quality string should
+    start with "!" (which using the PHRED mapping means the first letter always
+    has a quality score of zero).  This rather restrictive rule is not widely
+    observed, and is therefore ignored here.  One plus point about this "!" rule
+    is that (provided there are no line breaks in the quality sequence) it
+    would prevent the above problem with the "@" character.
+    """
+    try:
+        handle = open(source)
+    except TypeError:
+        handle = source
+        if handle.read(0) != "":
+            raise StreamModeError("Fastq files must be opened in text mode") from None
+    try:
+        try:
+            line = next(handle)
+        except StopIteration:
+            return  # Premature end of file, or just empty?
+
+        while True:
+            if line[0] != "@":
+                raise ValueError(
+                    "Records in Fastq files should start with '@' character"
+                )
+            title_line = line[1:].rstrip()
+            seq_string = ""
+            # There will now be one or more sequence lines; keep going until we
+            # find the "+" marking the quality line:
+            for line in handle:
+                if line[0] == "+":
+                    break
+                seq_string += line.rstrip()
+            else:
+                if seq_string:
+                    raise ValueError("End of file without quality information.")
+                else:
+                    raise ValueError("Unexpected end of file")
+            # The title here is optional, but if present must match!
+            second_title = line[1:].rstrip()
+            if second_title and second_title != title_line:
+                raise ValueError("Sequence and quality captions differ.")
+            # This is going to slow things down a little, but assuming
+            # this isn't allowed we should try and catch it here:
+            if " " in seq_string or "\t" in seq_string:
+                raise ValueError("Whitespace is not allowed in the sequence.")
+            seq_len = len(seq_string)
+
+            # There will now be at least one line of quality data, followed by
+            # another sequence, or EOF
+            line = None
+            quality_string = ""
+            for line in handle:
+                if line[0] == "@":
+                    # This COULD be the start of a new sequence. However, it MAY just
+                    # be a line of quality data which starts with a "@" character.  We
+                    # should be able to check this by looking at the sequence length
+                    # and the amount of quality data found so far.
+                    if len(quality_string) >= seq_len:
+                        # We expect it to be equal if this is the start of a new record.
+                        # If the quality data is longer, we'll raise an error below.
+                        break
+                    # Continue - it's just some (more) quality data.
+                quality_string += line.rstrip()
+            else:
+                if line is None:
+                    raise ValueError("Unexpected end of file")
+                line = None
+
+            if seq_len != len(quality_string):
+                raise ValueError(
+                    "Lengths of sequence and quality values differs for %s (%i and %i)."
+                    % (title_line, seq_len, len(quality_string))
+                )
+
+            # Return the record and then continue...
+            yield (title_line, seq_string, quality_string)
+
+            if line is None:
+                break
+    finally:
+        if handle is not source:
+            handle.close()
+
+
+class FastqPhredIterator(SequenceIterator):
+    """Parser for FASTQ files."""
+
+    def __init__(self, source, alphabet=None, title2ids=None):
+        """Iterate over FASTQ records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, no longer used. Leave as None.
+         - title2ids - A function that, when given the title line from the FASTQ
+           file (without the beginning "@"), will return the id, name and
+           description (in that order) for the record as a tuple of strings.
+           If this is not given, then the entire title line will be used as
+           the description, and the first word as the id and name.
+
+        Note that use of title2ids matches that of Bio.SeqIO.FastaIO.
+
+        For each sequence in a (Sanger style) FASTQ file there is a matching string
+        encoding the PHRED qualities (integers between 0 and about 90) using ASCII
+        values with an offset of 33.
+
+        For example, consider a file containing three short reads::
+
+            @EAS54_6_R1_2_1_413_324
+            CCCTTCTTGTCTTCAGCGTTTCTCC
+            +
+            ;;3;;;;;;;;;;;;7;;;;;;;88
+            @EAS54_6_R1_2_1_540_792
+            TTGGCAGGCCAAGGCCGATGGATCA
+            +
+            ;;;;;;;;;;;7;;;;;-;;;3;83
+            @EAS54_6_R1_2_1_443_348
+            GTTGCTTCTGGCGTGGGTGGGGGGG
+            +
+            ;;;;;;;;;;;9;7;;.7;393333
+
+        For each sequence (e.g. "CCCTTCTTGTCTTCAGCGTTTCTCC") there is a matching
+        string encoding the PHRED qualities using ASCII values with an offset of
+        33 (e.g. ";;3;;;;;;;;;;;;7;;;;;;;88").
+
+        Using this module directly you might run:
+
+        >>> with open("Quality/example.fastq") as handle:
+        ...     for record in FastqPhredIterator(handle):
+        ...         print("%s %s" % (record.id, record.seq))
+        EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+        EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+        EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+        Typically however, you would call this via Bio.SeqIO instead with "fastq"
+        (or "fastq-sanger") as the format:
+
+        >>> from Bio import SeqIO
+        >>> with open("Quality/example.fastq") as handle:
+        ...     for record in SeqIO.parse(handle, "fastq"):
+        ...         print("%s %s" % (record.id, record.seq))
+        EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+        EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+        EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+        If you want to look at the qualities, they are recorded in each record's
+        per-letter-annotation dictionary as a simple list of integers:
+
+        >>> print(record.letter_annotations["phred_quality"])
+        [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        self.title2ids = title2ids
+        super().__init__(source, mode="t", fmt="Fastq")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        return self.iterate(handle)
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        title2ids = self.title2ids
+        assert SANGER_SCORE_OFFSET == ord("!")
+        # Originally, I used a list expression for each record:
+        #
+        # qualities = [ord(letter)-SANGER_SCORE_OFFSET for letter in quality_string]
+        #
+        # Precomputing is faster, perhaps partly by avoiding the subtractions.
+        q_mapping = {
+            chr(letter): letter - SANGER_SCORE_OFFSET
+            for letter in range(SANGER_SCORE_OFFSET, 94 + SANGER_SCORE_OFFSET)
+        }
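+        # For example, q_mapping["!"] == 0 and q_mapping["~"] == 93.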
+
+        for title_line, seq_string, quality_string in FastqGeneralIterator(handle):
+            if title2ids:
+                id, name, descr = title2ids(title_line)
+            else:
+                descr = title_line
+                id = descr.split()[0]
+                name = id
+            record = SeqRecord(Seq(seq_string), id=id, name=name, description=descr)
+            try:
+                qualities = [q_mapping[letter] for letter in quality_string]
+            except KeyError:
+                raise ValueError("Invalid character in quality string") from None
+            # For speed, will now use a dirty trick to speed up assigning the
+            # qualities. We do this to bypass the length check imposed by the
+            # per-letter-annotations restricted dict (as this has already been
+            # checked by FastqGeneralIterator). This is equivalent to:
+            # record.letter_annotations["phred_quality"] = qualities
+            dict.__setitem__(record._per_letter_annotations, "phred_quality", qualities)
+            yield record
+
+
+def FastqSolexaIterator(source, alphabet=None, title2ids=None):
+    r"""Parse old Solexa/Illumina FASTQ like files (which differ in the quality mapping).
+
+    The optional arguments are the same as those for the FastqPhredIterator.
+
+    For each sequence in Solexa/Illumina FASTQ files there is a matching string
+    encoding the Solexa integer qualities using ASCII values with an offset
+    of 64.  Solexa scores are scaled differently to PHRED scores, and Biopython
+    will NOT perform any automatic conversion when loading.
+
+    NOTE - This file format is used by the OLD versions of the Solexa/Illumina
+    pipeline. See also the FastqIlluminaIterator function for the NEW version.
+
+    For example, consider a file containing these five records::
+
+        @SLXA-B3_649_FC8437_R1_1_1_610_79
+        GATGTGCAATACCTTTGTAGAGGAA
+        +SLXA-B3_649_FC8437_R1_1_1_610_79
+        YYYYYYYYYYYYYYYYYYWYWYYSU
+        @SLXA-B3_649_FC8437_R1_1_1_397_389
+        GGTTTGAGAAAGAGAAATGAGATAA
+        +SLXA-B3_649_FC8437_R1_1_1_397_389
+        YYYYYYYYYWYYYYWWYYYWYWYWW
+        @SLXA-B3_649_FC8437_R1_1_1_850_123
+        GAGGGTGTTGATCATGATGATGGCG
+        +SLXA-B3_649_FC8437_R1_1_1_850_123
+        YYYYYYYYYYYYYWYYWYYSYYYSY
+        @SLXA-B3_649_FC8437_R1_1_1_362_549
+        GGAAACAAAGTTTTTCTCAACATAG
+        +SLXA-B3_649_FC8437_R1_1_1_362_549
+        YYYYYYYYYYYYYYYYYYWWWWYWY
+        @SLXA-B3_649_FC8437_R1_1_1_183_714
+        GTATTATTTAATGGCATACACTCAA
+        +SLXA-B3_649_FC8437_R1_1_1_183_714
+        YYYYYYYYYYWYYYYWYWWUWWWQQ
+
+    Using this module directly you might run:
+
+    >>> with open("Quality/solexa_example.fastq") as handle:
+    ...     for record in FastqSolexaIterator(handle):
+    ...         print("%s %s" % (record.id, record.seq))
+    SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA
+    SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA
+    SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG
+    SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG
+    SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA
+
+    Typically however, you would call this via Bio.SeqIO instead with
+    "fastq-solexa" as the format:
+
+    >>> from Bio import SeqIO
+    >>> with open("Quality/solexa_example.fastq") as handle:
+    ...     for record in SeqIO.parse(handle, "fastq-solexa"):
+    ...         print("%s %s" % (record.id, record.seq))
+    SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA
+    SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA
+    SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG
+    SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG
+    SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA
+
+    If you want to look at the qualities, they are recorded in each record's
+    per-letter-annotation dictionary as a simple list of integers:
+
+    >>> print(record.letter_annotations["solexa_quality"])
+    [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 23, 25, 25, 25, 25, 23, 25, 23, 23, 21, 23, 23, 23, 17, 17]
+
+    These scores aren't very good, but they are high enough that they map
+    almost exactly onto PHRED scores:
+
+    >>> print("%0.2f" % phred_quality_from_solexa(25))
+    25.01
+
+    Let's look at a faked example read which is even worse, where there are
+    more noticeable differences between the Solexa and PHRED scores::
+
+         @slxa_0001_1_0001_01
+         ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+         +slxa_0001_1_0001_01
+         hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;
+
+    Again, you would typically use Bio.SeqIO to read this file in (rather than
+    calling the Bio.SeqIO.QualityIO module directly).  Most FASTQ files will
+    contain thousands of reads, so you would normally use Bio.SeqIO.parse()
+    as shown above.  This example has only one entry, so instead we can
+    use the Bio.SeqIO.read() function:
+
+    >>> from Bio import SeqIO
+    >>> with open("Quality/solexa_faked.fastq") as handle:
+    ...     record = SeqIO.read(handle, "fastq-solexa")
+    >>> print("%s %s" % (record.id, record.seq))
+    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+    >>> print(record.letter_annotations["solexa_quality"])
+    [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
+
+    These quality scores are so low that when converted from the Solexa scheme
+    into PHRED scores they look quite different:
+
+    >>> print("%0.2f" % phred_quality_from_solexa(-1))
+    2.54
+    >>> print("%0.2f" % phred_quality_from_solexa(-5))
+    1.19
+
+    Note you can use the Bio.SeqIO.write() function or the SeqRecord's format
+    method to output the record(s):
+
+    >>> print(record.format("fastq-solexa"))
+    @slxa_0001_1_0001_01
+    ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+    +
+    hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;
+    
+
+    Note this output is slightly different from the input file as Biopython
+    has left out the optional repetition of the sequence identifier on the "+"
+    line.  If you want to use PHRED scores, use "fastq" or "qual" as the
+    output format instead, and Biopython will do the conversion for you:
+
+    >>> print(record.format("fastq"))
+    @slxa_0001_1_0001_01
+    ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+    +
+    IHGFEDCBA@?>=<;:9876543210/.-,++*)('&&%%$$##""
+    
+
+    >>> print(record.format("qual"))
+    >slxa_0001_1_0001_01
+    40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21
+    20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2
+    1 1
+    
+
+    As shown above, the poor quality Solexa reads have been mapped to the
+    equivalent PHRED score (e.g. -5 to 1 as shown earlier).
+    """
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    q_mapping = {
+        chr(letter): letter - SOLEXA_SCORE_OFFSET
+        for letter in range(SOLEXA_SCORE_OFFSET - 5, 63 + SOLEXA_SCORE_OFFSET)
+    }
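+    # For example, q_mapping[";"] == -5 and q_mapping["~"] == 62.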
+
+    for title_line, seq_string, quality_string in FastqGeneralIterator(source):
+        if title2ids:
+            id, name, descr = title2ids(title_line)
+        else:
+            descr = title_line
+            id = descr.split()[0]
+            name = id
+        record = SeqRecord(Seq(seq_string), id=id, name=name, description=descr)
+        try:
+            qualities = [q_mapping[letter] for letter in quality_string]
+        # DO NOT convert these into PHRED qualities automatically!
+        except KeyError:
+            raise ValueError("Invalid character in quality string") from None
+        # Dirty trick to speed up this line:
+        # record.letter_annotations["solexa_quality"] = qualities
+        dict.__setitem__(record._per_letter_annotations, "solexa_quality", qualities)
+        yield record
+
+
+def FastqIlluminaIterator(source, alphabet=None, title2ids=None):
+    """Parse Illumina 1.3 to 1.7 FASTQ like files (which differ in the quality mapping).
+
+    The optional arguments are the same as those for the FastqPhredIterator.
+
+    For each sequence in Illumina 1.3+ FASTQ files there is a matching string
+    encoding PHRED integer qualities using ASCII values with an offset of 64.
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read("Quality/illumina_faked.fastq", "fastq-illumina")
+    >>> print("%s %s" % (record.id, record.seq))
+    Test ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN
+    >>> max(record.letter_annotations["phred_quality"])
+    40
+    >>> min(record.letter_annotations["phred_quality"])
+    0
+
+    NOTE - Older versions of the Solexa/Illumina pipeline encoded Solexa scores
+    with an ASCII offset of 64. Solexa and PHRED scores are approximately equal,
+    but only for high quality reads. If you have an old Solexa/Illumina file
+    with negative Solexa scores and try to read it as an Illumina 1.3+ file,
+    it will fail:
+
+    >>> record2 = SeqIO.read("Quality/solexa_faked.fastq", "fastq-illumina")
+    Traceback (most recent call last):
+       ...
+    ValueError: Invalid character in quality string
+
+    NOTE - True Sanger style FASTQ files use PHRED scores with an offset of 33.
+    """
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    q_mapping = {
+        chr(letter): letter - SOLEXA_SCORE_OFFSET
+        for letter in range(SOLEXA_SCORE_OFFSET, 63 + SOLEXA_SCORE_OFFSET)
+    }
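+    # For example, q_mapping["@"] == 0 and q_mapping["~"] == 62.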
+
+    for title_line, seq_string, quality_string in FastqGeneralIterator(source):
+        if title2ids:
+            id, name, descr = title2ids(title_line)
+        else:
+            descr = title_line
+            id = descr.split()[0]
+            name = id
+        record = SeqRecord(Seq(seq_string), id=id, name=name, description=descr)
+        try:
+            qualities = [q_mapping[letter] for letter in quality_string]
+        except KeyError:
+            raise ValueError("Invalid character in quality string") from None
+        # Dirty trick to speed up this line:
+        # record.letter_annotations["phred_quality"] = qualities
+        dict.__setitem__(record._per_letter_annotations, "phred_quality", qualities)
+        yield record
+
+
+class QualPhredIterator(SequenceIterator):
+    """Parser for QUAL files with PHRED quality scores but no sequence."""
+
+    def __init__(self, source, alphabet=None, title2ids=None):
+        """For QUAL files which include PHRED quality scores, but no sequence.
+
+        For example, consider this short QUAL file::
+
+            >EAS54_6_R1_2_1_413_324
+            26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
+            26 26 26 23 23
+            >EAS54_6_R1_2_1_540_792
+            26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
+            26 18 26 23 18
+            >EAS54_6_R1_2_1_443_348
+            26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+            24 18 18 18 18
+
+        Using this module directly you might run:
+
+        >>> with open("Quality/example.qual") as handle:
+        ...     for record in QualPhredIterator(handle):
+        ...         print("%s read of length %d" % (record.id, len(record.seq)))
+        EAS54_6_R1_2_1_413_324 read of length 25
+        EAS54_6_R1_2_1_540_792 read of length 25
+        EAS54_6_R1_2_1_443_348 read of length 25
+
+        Typically however, you would call this via Bio.SeqIO instead with "qual"
+        as the format:
+
+        >>> from Bio import SeqIO
+        >>> with open("Quality/example.qual") as handle:
+        ...     for record in SeqIO.parse(handle, "qual"):
+        ...         print("%s read of length %d" % (record.id, len(record.seq)))
+        EAS54_6_R1_2_1_413_324 read of length 25
+        EAS54_6_R1_2_1_540_792 read of length 25
+        EAS54_6_R1_2_1_443_348 read of length 25
+
+        Only the sequence length is known, as the QUAL file does not contain
+        the sequence string itself.
+
+        The quality scores themselves are available as a list of integers
+        in each record's per-letter-annotation:
+
+        >>> print(record.letter_annotations["phred_quality"])
+        [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+        You can still slice one of these SeqRecord objects:
+
+        >>> sub_record = record[5:10]
+        >>> print("%s %s" % (sub_record.id, sub_record.letter_annotations["phred_quality"]))
+        EAS54_6_R1_2_1_443_348 [26, 26, 26, 26, 26]
+
+        As of Biopython 1.59, this parser will accept files with negative quality
+        scores, but will replace them with the lowest possible PHRED score of zero.
+        This triggers a warning; previously a ValueError exception was raised.
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        self.title2ids = title2ids
+        super().__init__(source, mode="t", fmt="QUAL")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        return self.iterate(handle)
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        title2ids = self.title2ids
+        # Skip any text before the first record (e.g. blank lines, comments)
+        for line in handle:
+            if line[0] == ">":
+                break
+        else:
+            return
+
+        while True:
+            if line[0] != ">":
+                raise ValueError(
+                    "Records in Fasta files should start with '>' character"
+                )
+            if title2ids:
+                id, name, descr = title2ids(line[1:].rstrip())
+            else:
+                descr = line[1:].rstrip()
+                id = descr.split()[0]
+                name = id
+
+            qualities = []
+            for line in handle:
+                if line[0] == ">":
+                    break
+                qualities.extend(int(word) for word in line.split())
+            else:
+                line = None
+
+            if qualities and min(qualities) < 0:
+                warnings.warn(
+                    "Negative quality score %i found, substituting PHRED zero instead."
+                    % min(qualities),
+                    BiopythonParserWarning,
+                )
+                qualities = [max(0, q) for q in qualities]
+
+            # Return the record and then continue...
+            sequence = Seq(None, length=len(qualities))
+            record = SeqRecord(sequence, id=id, name=name, description=descr)
+            # Dirty trick to speed up this line:
+            # record.letter_annotations["phred_quality"] = qualities
+            dict.__setitem__(record._per_letter_annotations, "phred_quality", qualities)
+            yield record
+
+            if line is None:
+                return  # StopIteration
+        raise ValueError("Unrecognised QUAL record format.")
+
+
+class FastqPhredWriter(SequenceWriter):
+    """Class to write standard FASTQ format files (using PHRED quality scores) (OBSOLETE).
+
+    Although you can use this class directly, you are strongly encouraged
+    to use the ``as_fastq`` function, or top level ``Bio.SeqIO.write()``
+    function instead via the format name "fastq" or the alias "fastq-sanger".
+
+    For example, this code reads in a standard Sanger style FASTQ file
+    (using PHRED scores) and re-saves it as another Sanger style FASTQ file:
+
+    >>> from Bio import SeqIO
+    >>> record_iterator = SeqIO.parse("Quality/example.fastq", "fastq")
+    >>> with open("Quality/temp.fastq", "w") as out_handle:
+    ...     SeqIO.write(record_iterator, out_handle, "fastq")
+    3
+
+    You might want to do this if the original file included extra line breaks,
+    which while valid may not be supported by all tools.  The output file from
+    Biopython will have each sequence on a single line, and each quality
+    string on a single line (which is considered desirable for maximum
+    compatibility).
+
+    In this next example, an old style Solexa/Illumina FASTQ file (using Solexa
+    quality scores) is converted into a standard Sanger style FASTQ file using
+    PHRED qualities:
+
+    >>> from Bio import SeqIO
+    >>> record_iterator = SeqIO.parse("Quality/solexa_example.fastq", "fastq-solexa")
+    >>> with open("Quality/temp.fastq", "w") as out_handle:
+    ...     SeqIO.write(record_iterator, out_handle, "fastq")
+    5
+
+    This code is also called if you use the .format("fastq") method of a
+    SeqRecord, or .format("fastq-sanger") if you prefer that alias.
+
+    Note that Sanger FASTQ files have an upper limit of PHRED quality 93, which is
+    encoded as ASCII 126, the tilde. If your quality scores are truncated to fit, a
+    warning is issued.
+
+    P.S. To avoid cluttering up your working directory, you can delete this
+    temporary file now:
+
+    >>> import os
+    >>> os.remove("Quality/temp.fastq")
+    """
+
+    assert SANGER_SCORE_OFFSET == ord("!")
+
+    def write_record(self, record):
+        """Write a single FASTQ record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+        # TODO - Is an empty sequence allowed in FASTQ format?
+        seq = record.seq
+        if seq is None:
+            raise ValueError("No sequence for record %s" % record.id)
+        qualities_str = _get_sanger_quality_str(record)
+        if len(qualities_str) != len(seq):
+            raise ValueError(
+                "Record %s has sequence length %i but %i quality scores"
+                % (record.id, len(seq), len(qualities_str))
+            )
+
+        # FASTQ files can include a description, just like FASTA files
+        # (at least, this is what the NCBI Short Read Archive does)
+        id = self.clean(record.id)
+        description = self.clean(record.description)
+        if description and description.split(None, 1)[0] == id:
+            # The description includes the id at the start
+            title = description
+        elif description:
+            title = "%s %s" % (id, description)
+        else:
+            title = id
+
+        self.handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qualities_str))
+
+
+def as_fastq(record):
+    """Turn a SeqRecord into a Sanger FASTQ formatted string.
+
+    This is used internally by the SeqRecord's .format("fastq")
+    method and by the SeqIO.write(..., ..., "fastq") function,
+    and under the format alias "fastq-sanger" as well.
+    """
+    seq_str = _get_seq_string(record)
+    qualities_str = _get_sanger_quality_str(record)
+    if len(qualities_str) != len(seq_str):
+        raise ValueError(
+            "Record %s has sequence length %i but %i quality scores"
+            % (record.id, len(seq_str), len(qualities_str))
+        )
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    return "@%s\n%s\n+\n%s\n" % (title, seq_str, qualities_str)
+
+
+class QualPhredWriter(SequenceWriter):
+    """Class to write QUAL format files (using PHRED quality scores) (OBSOLETE).
+
+    Although you can use this class directly, you are strongly encouraged
+    to use the ``as_qual`` function, or top level ``Bio.SeqIO.write()``
+    function instead.
+
+    For example, this code reads in a FASTQ file and saves the quality scores
+    into a QUAL file:
+
+    >>> from Bio import SeqIO
+    >>> record_iterator = SeqIO.parse("Quality/example.fastq", "fastq")
+    >>> with open("Quality/temp.qual", "w") as out_handle:
+    ...     SeqIO.write(record_iterator, out_handle, "qual")
+    3
+
+    This code is also called if you use the .format("qual") method of a
+    SeqRecord.
+
+    P.S. Don't forget to clean up the temp file if you don't need it anymore:
+
+    >>> import os
+    >>> os.remove("Quality/temp.qual")
+    """
+
+    def __init__(self, handle, wrap=60, record2title=None):
+        """Create a QUAL writer.
+
+        Arguments:
+         - handle - Handle to an output file, e.g. as returned
+           by open(filename, "w")
+         - wrap   - Optional line length used to wrap sequence lines.
+           Defaults to wrapping the sequence at 60 characters. Use
+           zero (or None) for no wrapping, giving a single long line
+           for the sequence.
+         - record2title - Optional function to return the text to be
+           used for the title line of each record.  By default a
+           combination of the record.id and record.description is
+           used.  If the record.description starts with the record.id,
+           then just the record.description is used.
+
+        The record2title argument is present for consistency with the
+        Bio.SeqIO.FastaIO writer class.
+        """
+        super().__init__(handle)
+        if wrap:
+            if wrap < 1:
+                raise ValueError("wrap must be None, zero, or a positive integer")
+        self.wrap = wrap
+        self.record2title = record2title
+
+    def write_record(self, record):
+        """Write a single QUAL record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        handle = self.handle
+        wrap = self.wrap
+
+        if self.record2title:
+            title = self.clean(self.record2title(record))
+        else:
+            id = self.clean(record.id)
+            description = self.clean(record.description)
+            if description and description.split(None, 1)[0] == id:
+                # The description includes the id at the start
+                title = description
+            elif description:
+                title = "%s %s" % (id, description)
+            else:
+                title = id
+        handle.write(">%s\n" % title)
+
+        qualities = _get_phred_quality(record)
+        try:
+            # This rounds to the nearest integer.
+            # TODO - can we record a float in a qual file?
+            qualities_strs = [("%i" % round(q, 0)) for q in qualities]
+        except TypeError:
+            if None in qualities:
+                raise TypeError("A quality value of None was found") from None
+            else:
+                raise
+
+        if wrap and wrap > 5:
+            # Fast wrapping (the extra "wrap and" guards against wrap=None)
+            data = " ".join(qualities_strs)
+            while True:
+                if len(data) <= wrap:
+                    self.handle.write(data + "\n")
+                    break
+                else:
+                    # By construction there must be spaces in the first X chars
+                    # (unless we have X digit or higher quality scores!)
+                    i = data.rfind(" ", 0, wrap)
+                    handle.write(data[:i] + "\n")
+                    data = data[i + 1 :]
+        elif wrap:
+            # Safe wrapping
+            while qualities_strs:
+                line = qualities_strs.pop(0)
+                while qualities_strs and len(line) + 1 + len(qualities_strs[0]) < wrap:
+                    line += " " + qualities_strs.pop(0)
+                handle.write(line + "\n")
+        else:
+            # No wrapping
+            data = " ".join(qualities_strs)
+            handle.write(data + "\n")
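+        # For example, with wrap=10 the scores [26, 26, 26, 26] give the data
+        # string "26 26 26 26"; the fast path splits at the last space before
+        # column 10, writing "26 26 26" then "26".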
+
+
+def as_qual(record):
+    """Turn a SeqRecord into a QUAL formatted string.
+
+    This is used internally by the SeqRecord's .format("qual")
+    method and by the SeqIO.write(..., ..., "qual") function.
+    """
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    lines = [">%s\n" % title]
+
+    qualities = _get_phred_quality(record)
+    try:
+        # This rounds to the nearest integer.
+        # TODO - can we record a float in a qual file?
+        qualities_strs = [("%i" % round(q, 0)) for q in qualities]
+    except TypeError:
+        if None in qualities:
+            raise TypeError("A quality value of None was found") from None
+        else:
+            raise
+
+    # Safe wrapping
+    while qualities_strs:
+        line = qualities_strs.pop(0)
+        while qualities_strs and len(line) + 1 + len(qualities_strs[0]) < 60:
+            line += " " + qualities_strs.pop(0)
+        lines.append(line + "\n")
+    return "".join(lines)
+
+
+class FastqSolexaWriter(SequenceWriter):
+    r"""Write old style Solexa/Illumina FASTQ format files (with Solexa qualities) (OBSOLETE).
+
+    This outputs FASTQ files like those from the early Solexa/Illumina
+    pipeline, using Solexa scores and an ASCII offset of 64. These are
+    NOT compatible with the standard Sanger style PHRED FASTQ files.
+
+    If your records contain a "solexa_quality" entry under letter_annotations,
+    this is used, otherwise any "phred_quality" entry will be used after
+    conversion using the solexa_quality_from_phred function. If neither style
+    of quality scores are present, an exception is raised.
+
+    Although you can use this class directly, you are strongly encouraged
+    to use the ``as_fastq_solexa`` function, or top-level ``Bio.SeqIO.write()``
+    function instead.  For example, this code reads in a FASTQ file and re-saves
+    it as another FASTQ file:
+
+    >>> from Bio import SeqIO
+    >>> record_iterator = SeqIO.parse("Quality/solexa_example.fastq", "fastq-solexa")
+    >>> with open("Quality/temp.fastq", "w") as out_handle:
+    ...     SeqIO.write(record_iterator, out_handle, "fastq-solexa")
+    5
+
+    You might want to do this if the original file included extra line breaks,
+    which (while valid) may not be supported by all tools.  The output file
+    from Biopython will have each sequence on a single line, and each quality
+    string on a single line (which is considered desirable for maximum
+    compatibility).
+
+    This code is also called if you use the .format("fastq-solexa") method of
+    a SeqRecord. For example,
+
+    >>> record = SeqIO.read("Quality/sanger_faked.fastq", "fastq-sanger")
+    >>> print(record.format("fastq-solexa"))
+    @Test PHRED qualities from 40 to 0 inclusive
+    ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN
+    +
+    hgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>:;
+    
+
+    Note that Solexa FASTQ files have an upper limit of Solexa quality 62, which is
+    encoded as ASCII 126, the tilde.  If your quality scores must be truncated to fit,
+    a warning is issued.
+
+    P.S. Don't forget to delete the temp file if you don't need it anymore:
+
+    >>> import os
+    >>> os.remove("Quality/temp.fastq")
+    """
+
+    def write_record(self, record):
+        """Write a single FASTQ record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        # TODO - Is an empty sequence allowed in FASTQ format?
+        seq = record.seq
+        if seq is None:
+            raise ValueError("No sequence for record %s" % record.id)
+        qualities_str = _get_solexa_quality_str(record)
+        if len(qualities_str) != len(seq):
+            raise ValueError(
+                "Record %s has sequence length %i but %i quality scores"
+                % (record.id, len(seq), len(qualities_str))
+            )
+
+        # FASTQ files can include a description, just like FASTA files
+        # (at least, this is what the NCBI Short Read Archive does)
+        id = self.clean(record.id)
+        description = self.clean(record.description)
+        if description and description.split(None, 1)[0] == id:
+            # The description includes the id at the start
+            title = description
+        elif description:
+            title = "%s %s" % (id, description)
+        else:
+            title = id
+
+        self.handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qualities_str))
+
+
+def as_fastq_solexa(record):
+    """Turn a SeqRecord into a Solexa FASTQ formatted string.
+
+    This is used internally by the SeqRecord's .format("fastq-solexa")
+    method and by the SeqIO.write(..., ..., "fastq-solexa") function.
+    """
+    seq_str = _get_seq_string(record)
+    qualities_str = _get_solexa_quality_str(record)
+    if len(qualities_str) != len(seq_str):
+        raise ValueError(
+            "Record %s has sequence length %i but %i quality scores"
+            % (record.id, len(seq_str), len(qualities_str))
+        )
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        # The description includes the id at the start
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    return "@%s\n%s\n+\n%s\n" % (title, seq_str, qualities_str)
+
+
+class FastqIlluminaWriter(SequenceWriter):
+    r"""Write Illumina 1.3+ FASTQ format files (with PHRED quality scores) (OBSOLETE).
+
+    This outputs FASTQ files like those from the Solexa/Illumina 1.3+ pipeline,
+    using PHRED scores and an ASCII offset of 64. Note these files are NOT
+    compatible with the standard Sanger style PHRED FASTQ files which use an
+    ASCII offset of 33.
+
+    Although you can use this class directly, you are strongly encouraged to
+    use the ``as_fastq_illumina`` or top-level ``Bio.SeqIO.write()`` function
+    with format name "fastq-illumina" instead. This code is also called if you
+    use the .format("fastq-illumina") method of a SeqRecord. For example,
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read("Quality/sanger_faked.fastq", "fastq-sanger")
+    >>> print(record.format("fastq-illumina"))
+    @Test PHRED qualities from 40 to 0 inclusive
+    ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN
+    +
+    hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
+    
+
+    Note that Illumina FASTQ files have an upper limit of PHRED quality 62, which is
+    encoded as ASCII 126, the tilde. If your quality scores are truncated to fit, a
+    warning is issued.
+    """
+
+    def write_record(self, record):
+        """Write a single FASTQ record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        # TODO - Is an empty sequence allowed in FASTQ format?
+        seq = record.seq
+        if seq is None:
+            raise ValueError("No sequence for record %s" % record.id)
+        qualities_str = _get_illumina_quality_str(record)
+        if len(qualities_str) != len(seq):
+            raise ValueError(
+                "Record %s has sequence length %i but %i quality scores"
+                % (record.id, len(seq), len(qualities_str))
+            )
+
+        # FASTQ files can include a description, just like FASTA files
+        # (at least, this is what the NCBI Short Read Archive does)
+        id = self.clean(record.id)
+        description = self.clean(record.description)
+        if description and description.split(None, 1)[0] == id:
+            # The description includes the id at the start
+            title = description
+        elif description:
+            title = "%s %s" % (id, description)
+        else:
+            title = id
+
+        self.handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qualities_str))
+
+
+def as_fastq_illumina(record):
+    """Turn a SeqRecord into an Illumina FASTQ formatted string.
+
+    This is used internally by the SeqRecord's .format("fastq-illumina")
+    method and by the SeqIO.write(..., ..., "fastq-illumina") function.
+    """
+    seq_str = _get_seq_string(record)
+    qualities_str = _get_illumina_quality_str(record)
+    if len(qualities_str) != len(seq_str):
+        raise ValueError(
+            "Record %s has sequence length %i but %i quality scores"
+            % (record.id, len(seq_str), len(qualities_str))
+        )
+    id = _clean(record.id)
+    description = _clean(record.description)
+    if description and description.split(None, 1)[0] == id:
+        title = description
+    elif description:
+        title = "%s %s" % (id, description)
+    else:
+        title = id
+    return "@%s\n%s\n+\n%s\n" % (title, seq_str, qualities_str)
+
+
+def PairedFastaQualIterator(fasta_source, qual_source, alphabet=None, title2ids=None):
+    """Iterate over matched FASTA and QUAL files as SeqRecord objects.
+
+    For example, consider this short QUAL file with PHRED quality scores::
+
+        >EAS54_6_R1_2_1_413_324
+        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
+        26 26 26 23 23
+        >EAS54_6_R1_2_1_540_792
+        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
+        26 18 26 23 18
+        >EAS54_6_R1_2_1_443_348
+        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+        24 18 18 18 18
+
+    And a matching FASTA file::
+
+        >EAS54_6_R1_2_1_413_324
+        CCCTTCTTGTCTTCAGCGTTTCTCC
+        >EAS54_6_R1_2_1_540_792
+        TTGGCAGGCCAAGGCCGATGGATCA
+        >EAS54_6_R1_2_1_443_348
+        GTTGCTTCTGGCGTGGGTGGGGGGG
+
+    You can parse these separately using Bio.SeqIO with the "qual" and
+    "fasta" formats, but then you'll get a group of SeqRecord objects with
+    no sequence, and a matching group with the sequence but not the
+    qualities.  Because it only deals with one input file handle, Bio.SeqIO
+    can't be used to read the two files together - but this function can!
+    For example,
+
+    >>> with open("Quality/example.fasta") as f:
+    ...     with open("Quality/example.qual") as q:
+    ...         for record in PairedFastaQualIterator(f, q):
+    ...             print("%s %s" % (record.id, record.seq))
+    ...
+    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
+    they are in each record's per-letter-annotation dictionary as a simple
+    list of integers:
+
+    >>> print(record.letter_annotations["phred_quality"])
+    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+    If you have access to data as a FASTQ format file, using that directly
+    would be simpler and more straightforward.  Note that you can easily use
+    this function to convert paired FASTA and QUAL files into FASTQ files:
+
+    >>> from Bio import SeqIO
+    >>> with open("Quality/example.fasta") as f:
+    ...     with open("Quality/example.qual") as q:
+    ...         SeqIO.write(PairedFastaQualIterator(f, q), "Quality/temp.fastq", "fastq")
+    ...
+    3
+
+    And don't forget to clean up the temp file if you don't need it anymore:
+
+    >>> import os
+    >>> os.remove("Quality/temp.fastq")
+    """
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    from Bio.SeqIO.FastaIO import FastaIterator
+
+    fasta_iter = FastaIterator(fasta_source, title2ids=title2ids)
+    qual_iter = QualPhredIterator(qual_source, title2ids=title2ids)
+
+    # Using zip would not load everything into memory, but it also would not
+    # catch any extra records found in only one of the two files.
+    while True:
+        try:
+            f_rec = next(fasta_iter)
+        except StopIteration:
+            f_rec = None
+        try:
+            q_rec = next(qual_iter)
+        except StopIteration:
+            q_rec = None
+        if f_rec is None and q_rec is None:
+            # End of both files
+            break
+        if f_rec is None:
+            raise ValueError("FASTA file has more entries than the QUAL file.")
+        if q_rec is None:
+            raise ValueError("QUAL file has more entries than the FASTA file.")
+        if f_rec.id != q_rec.id:
+            raise ValueError(
+                "FASTA and QUAL entries do not match (%s vs %s)." % (f_rec.id, q_rec.id)
+            )
+        if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]):
+            raise ValueError(
+                "Sequence length and number of quality scores disagree for %s"
+                % f_rec.id
+            )
+        # Merge the data....
+        f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations[
+            "phred_quality"
+        ]
+        yield f_rec
+    # Done
+
+
+def _fastq_generic(in_file, out_file, mapping):
+    """FASTQ helper function where can't have data loss by truncation (PRIVATE)."""
+    # For real speed, don't even make SeqRecord and Seq objects!
+    count = 0
+    null = chr(0)
+    with as_handle(out_file, "w") as out_handle:
+        for title, seq, old_qual in FastqGeneralIterator(in_file):
+            count += 1
+            # map the qual...
+            qual = old_qual.translate(mapping)
+            if null in qual:
+                raise ValueError("Invalid character in quality string")
+            out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+    return count
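+
+
+# A minimal illustrative sketch (not part of the API) of how the 256-character
+# translation tables below work with str.translate: each input character's
+# ordinal indexes into the table, so valid quality characters map to themselves
+# (or a re-encoded character) while everything else maps to chr(0), which the
+# helpers above treat as an error. Using the Sanger-to-Sanger table built in
+# _fastq_sanger_convert_fastq_sanger below:
+#
+#     mapping = "".join(
+#         [chr(0) for _ in range(0, 33)]
+#         + [chr(c) for c in range(33, 127)]
+#         + [chr(0) for _ in range(127, 256)]
+#     )
+#     assert "IIII".translate(mapping) == "IIII"   # valid Sanger qualities
+#     assert chr(0) in "\tIII".translate(mapping)  # tab is flagged as invalid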
+
+
+def _fastq_generic2(in_file, out_file, mapping, truncate_char, truncate_msg):
+    """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
+    # For real speed, don't even make SeqRecord and Seq objects!
+    count = 0
+    null = chr(0)
+    with as_handle(out_file, "w") as out_handle:
+        for title, seq, old_qual in FastqGeneralIterator(in_file):
+            count += 1
+            # map the qual...
+            qual = old_qual.translate(mapping)
+            if null in qual:
+                raise ValueError("Invalid character in quality string")
+            if truncate_char in qual:
+                qual = qual.replace(truncate_char, chr(126))
+                warnings.warn(truncate_msg, BiopythonWarning)
+            out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+    return count
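+
+
+# Illustrative worked example of the truncation handled by _fastq_generic2:
+# in the Sanger to Illumina 1.3+ table further below, a Sanger "~" (PHRED 93)
+# falls in the 96-126 block mapped to trunc_char, and is then rewritten as
+# chr(126) == "~", i.e. only quality 62 in the offset-64 encoding - hence
+# the data loss warning.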
+
+
+def _fastq_sanger_convert_fastq_sanger(in_file, out_file):
+    """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
+
+    Useful for removing line wrapping and the redundant second identifier
+    on the plus lines. Will also check that the quality string is valid.
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 33)]
+        + [chr(ascii) for ascii in range(33, 127)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_solexa_convert_fastq_solexa(in_file, out_file):
+    """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
+
+    Useful for removing line wrapping and the redundant second identifier
+    on the plus lines. Will also check that the quality string is valid.
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 59)]
+        + [chr(ascii) for ascii in range(59, 127)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_illumina_convert_fastq_illumina(in_file, out_file):
+    """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
+
+    Useful for removing line wrapping and the redundant second identifier
+    on the plus lines. Will also check that the quality string is valid.
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 64)]
+        + [chr(ascii) for ascii in range(64, 127)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_illumina_convert_fastq_sanger(in_file, out_file):
+    """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 64)]
+        + [chr(33 + q) for q in range(0, 62 + 1)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_sanger_convert_fastq_illumina(in_file, out_file):
+    """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion. Will issue a warning if the scores had to be truncated at 62
+    (maximum possible in the Illumina 1.3+ FASTQ format)
+    """
+    # Map unexpected chars to null
+    trunc_char = chr(1)
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 33)]
+        + [chr(64 + q) for q in range(0, 62 + 1)]
+        + [trunc_char for ascii in range(96, 127)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic2(
+        in_file,
+        out_file,
+        mapping,
+        trunc_char,
+        "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ",
+    )
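+
+
+# Worked example (illustrative): Sanger encodes PHRED 40 as chr(40 + 33) == "I"
+# while Illumina 1.3+ uses chr(40 + 64) == "h", so the two tables above shift
+# the printable quality characters by 64 - 33 == 31 in either direction:
+#
+#     # with mapping as built in _fastq_sanger_convert_fastq_illumina:
+#     assert "I".translate(mapping) == "h"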
+
+
+def _fastq_solexa_convert_fastq_sanger(in_file, out_file):
+    """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 59)]
+        + [
+            chr(33 + int(round(phred_quality_from_solexa(q))))
+            for q in range(-5, 62 + 1)
+        ]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
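+
+
+# Worked example (illustrative): the lowest Solexa score is -5, which converts
+# to PHRED 10 * log10(10 ** (-5 / 10) + 1) ~= 1.19; this rounds to 1 and is
+# written as chr(33 + 1) == '"' in the Sanger encoding.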
+
+
+def _fastq_sanger_convert_fastq_solexa(in_file, out_file):
+    """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion. Will issue a warning if the scores had to be truncated at 62
+    (maximum possible in the Solexa FASTQ format)
+    """
+    # Map unexpected chars to null
+    trunc_char = chr(1)
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 33)]
+        + [chr(64 + int(round(solexa_quality_from_phred(q)))) for q in range(0, 62 + 1)]
+        + [trunc_char for ascii in range(96, 127)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic2(
+        in_file,
+        out_file,
+        mapping,
+        trunc_char,
+        "Data loss - max Solexa quality 62 in Solexa FASTQ",
+    )
+
+
+def _fastq_solexa_convert_fastq_illumina(in_file, out_file):
+    """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 59)]
+        + [
+            chr(64 + int(round(phred_quality_from_solexa(q))))
+            for q in range(-5, 62 + 1)
+        ]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_illumina_convert_fastq_solexa(in_file, out_file):
+    """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
+
+    Avoids creating SeqRecord and Seq objects in order to speed up this
+    conversion.
+    """
+    # Map unexpected chars to null
+    mapping = "".join(
+        [chr(0) for ascii in range(0, 64)]
+        + [chr(64 + int(round(solexa_quality_from_phred(q)))) for q in range(0, 62 + 1)]
+        + [chr(0) for ascii in range(127, 256)]
+    )
+    assert len(mapping) == 256
+    return _fastq_generic(in_file, out_file, mapping)
+
+
+def _fastq_convert_fasta(in_file, out_file):
+    """Fast FASTQ to FASTA conversion (PRIVATE).
+
+    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
+    Seq objects in order to speed up this conversion.
+
+    NOTE - This does NOT check that the characters used in the FASTQ quality
+    string are valid!
+    """
+    # For real speed, don't even make SeqRecord and Seq objects!
+    count = 0
+    with as_handle(out_file, "w") as out_handle:
+        for title, seq, qual in FastqGeneralIterator(in_file):
+            count += 1
+            out_handle.write(">%s\n" % title)
+            # Do line wrapping
+            for i in range(0, len(seq), 60):
+                out_handle.write(seq[i : i + 60] + "\n")
+    return count
+
+
+def _fastq_convert_tab(in_file, out_file):
+    """Fast FASTQ to simple tabbed conversion (PRIVATE).
+
+    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
+    Seq objects in order to speed up this conversion.
+
+    NOTE - This does NOT check that the characters used in the FASTQ quality
+    string are valid!
+    """
+    # For real speed, don't even make SeqRecord and Seq objects!
+    count = 0
+    with as_handle(out_file, "w") as out_handle:
+        for title, seq, qual in FastqGeneralIterator(in_file):
+            count += 1
+            out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
+    return count
+
+
+def _fastq_convert_qual(in_file, out_file, mapping):
+    """FASTQ helper function for QUAL output (PRIVATE).
+
+    Mapping should be a dictionary mapping expected ASCII characters from the
+    FASTQ quality string to PHRED quality scores (as strings).
+    """
+    # For real speed, don't even make SeqRecord and Seq objects!
+    count = 0
+    with as_handle(out_file, "w") as out_handle:
+        for title, seq, qual in FastqGeneralIterator(in_file):
+            count += 1
+            out_handle.write(">%s\n" % title)
+            # map the qual... note even with Sanger encoding max 2 digits
+            try:
+                qualities_strs = [mapping[ascii] for ascii in qual]
+            except KeyError:
+                raise ValueError("Invalid character in quality string") from None
+            data = " ".join(qualities_strs)
+            while len(data) > 60:
+                # Know quality scores are either 1 or 2 digits, so there
+                # must be a space in any three consecutive characters.
+                if data[60] == " ":
+                    out_handle.write(data[:60] + "\n")
+                    data = data[61:]
+                elif data[59] == " ":
+                    out_handle.write(data[:59] + "\n")
+                    data = data[60:]
+                else:
+                    assert data[58] == " ", "Internal logic failure in wrapping"
+                    out_handle.write(data[:58] + "\n")
+                    data = data[59:]
+            out_handle.write(data + "\n")
+    return count
+
+
+def _fastq_sanger_convert_qual(in_file, out_file):
+    """Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
+    mapping = {chr(q + 33): str(q) for q in range(0, 93 + 1)}
+    return _fastq_convert_qual(in_file, out_file, mapping)
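+
+
+# For example (illustrative), under the Sanger table above PHRED 40 is stored
+# as chr(40 + 33) == "I", so mapping["I"] == "40" and a FASTQ quality string
+# of "IIII" becomes "40 40 40 40" in the QUAL output.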
+
+
+def _fastq_solexa_convert_qual(in_file, out_file):
+    """Fast Solexa FASTQ to QUAL conversion (PRIVATE)."""
+    mapping = {
+        chr(q + 64): str(int(round(phred_quality_from_solexa(q))))
+        for q in range(-5, 62 + 1)
+    }
+    return _fastq_convert_qual(in_file, out_file, mapping)
+
+
+def _fastq_illumina_convert_qual(in_file, out_file):
+    """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
+    mapping = {chr(q + 64): str(q) for q in range(0, 62 + 1)}
+    return _fastq_convert_qual(in_file, out_file, mapping)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/SeqXmlIO.py b/code/lib/Bio/SeqIO/SeqXmlIO.py
new file mode 100644
index 0000000..c4d15f6
--- /dev/null
+++ b/code/lib/Bio/SeqIO/SeqXmlIO.py
@@ -0,0 +1,669 @@
+# Copyright 2010 by Thomas Schmitt.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "seqxml" file format, SeqXML.
+
+This module is for reading and writing SeqXML format files as
+SeqRecord objects, and is expected to be used via the Bio.SeqIO API.
+
+SeqXML is a lightweight XML format which is intended as an alternative to
+FASTA files. For more information see http://www.seqXML.org and Schmitt et al
+(2011), https://doi.org/10.1093/bib/bbr025
+"""
+from xml import sax
+from xml.sax import handler
+from xml.sax.saxutils import XMLGenerator
+from xml.sax.xmlreader import AttributesImpl
+
+from Bio.Seq import Seq
+from Bio.Seq import UnknownSeq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+class ContentHandler(handler.ContentHandler):
+    """Handles XML events generated by the parser (PRIVATE)."""
+
+    def __init__(self):
+        """Create a handler to handle XML events."""
+        super().__init__()
+        self.source = None
+        self.sourceVersion = None
+        self.seqXMLversion = None
+        self.ncbiTaxID = None
+        self.speciesName = None
+        self.startElementNS = None
+        self.data = None
+        self.records = []
+
+    def startDocument(self):
+        """Set XML handlers when an XML declaration is found."""
+        self.startElementNS = self.startSeqXMLElement
+
+    def startSeqXMLElement(self, name, qname, attrs):
+        """Handle start of a seqXML element."""
+        if name != (None, "seqXML"):
+            raise ValueError("Failed to find the start of seqXML element")
+        if qname is not None:
+            raise RuntimeError("Unexpected qname for seqXML element")
+        schema = None
+        for key, value in attrs.items():
+            namespace, localname = key
+            if namespace is None:
+                if localname == "source":
+                    self.source = value
+                elif localname == "sourceVersion":
+                    self.sourceVersion = value
+                elif localname == "seqXMLversion":
+                    self.seqXMLversion = value
+                elif localname == "ncbiTaxID":
+                    # check if it is an integer, but store as string
+                    number = int(value)
+                    self.ncbiTaxID = value
+                elif localname == "speciesName":
+                    self.speciesName = value
+                else:
+                    raise ValueError("Unexpected attribute for XML Schema")
+            elif namespace == "http://www.w3.org/2001/XMLSchema-instance":
+                if localname == "noNamespaceSchemaLocation":
+                    schema = value
+                else:
+                    raise ValueError("Unexpected attribute for XML Schema in namespace")
+            else:
+                raise ValueError(
+                    "Unexpected namespace '%s' for seqXML attribute" % namespace
+                )
+        if self.seqXMLversion is None:
+            raise ValueError("Failed to find seqXMLversion")
+        url = "http://www.seqxml.org/%s/seqxml.xsd" % self.seqXMLversion
+        if schema != url:
+            raise ValueError(
+                "XML Schema '%s' found not consistent with reported seqXML version %s"
+                % (schema, self.seqXMLversion)
+            )
+        self.endElementNS = self.endSeqXMLElement
+        self.startElementNS = self.startEntryElement
+
+    def endSeqXMLElement(self, name, qname):
+        """Handle end of the seqXML element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError("Unexpected namespace '%s' for seqXML end" % namespace)
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for seqXML end" % qname)
+        if localname != "seqXML":
+            raise RuntimeError("Failed to find end of seqXML element")
+        self.startElementNS = None
+        self.endElementNS = None
+
+    def startEntryElement(self, name, qname, attrs):
+        """Set new entry with id and the optional entry source (PRIVATE)."""
+        if name != (None, "entry"):
+            raise ValueError("Expected to find the start of an entry element")
+        if qname is not None:
+            raise RuntimeError("Unexpected qname for entry element")
+        record = SeqRecord("", id=None)
+        if self.speciesName is not None:
+            record.annotations["organism"] = self.speciesName
+        if self.ncbiTaxID is not None:
+            record.annotations["ncbi_taxid"] = self.ncbiTaxID
+        record.annotations["source"] = self.source
+        for key, value in attrs.items():
+            namespace, localname = key
+            if namespace is None:
+                if localname == "id":
+                    record.id = value
+                elif localname == "source":
+                    record.annotations["source"] = value
+                else:
+                    raise ValueError(
+                        "Unexpected attribute %s in entry element" % localname
+                    )
+            else:
+                raise ValueError(
+                    "Unexpected namespace '%s' for entry attribute" % namespace
+                )
+        if record.id is None:
+            raise ValueError("Failed to find entry ID")
+        self.records.append(record)
+        self.startElementNS = self.startEntryFieldElement
+        self.endElementNS = self.endEntryElement
+
+    def endEntryElement(self, name, qname):
+        """Handle end of an entry element."""
+        if name != (None, "entry"):
+            raise ValueError("Expected to find the end of an entry element")
+        if qname is not None:
+            raise RuntimeError("Unexpected qname for entry element")
+        self.startElementNS = self.startEntryElement
+        self.endElementNS = self.endSeqXMLElement
+
+    def startEntryFieldElement(self, name, qname, attrs):
+        """Receive a field of an entry element and forward it."""
+        namespace, localname = name
+        if namespace is not None:
+            raise ValueError(
+                "Unexpected namespace '%s' for %s element" % (namespace, localname)
+            )
+        if qname is not None:
+            raise RuntimeError(
+                "Unexpected qname '%s' for %s element" % (qname, localname)
+            )
+        if localname == "species":
+            return self.startSpeciesElement(attrs)
+        if localname == "description":
+            return self.startDescriptionElement(attrs)
+        if localname in ("DNAseq", "RNAseq", "AAseq"):
+            return self.startSequenceElement(attrs)
+        if localname == "DBRef":
+            return self.startDBRefElement(attrs)
+        if localname == "property":
+            return self.startPropertyElement(attrs)
+        raise ValueError("Unexpected field %s in entry" % localname)
+
+    def startSpeciesElement(self, attrs):
+        """Parse the species information."""
+        name = None
+        ncbiTaxID = None
+        for key, value in attrs.items():
+            namespace, localname = key
+            if namespace is None:
+                if localname == "name":
+                    name = value
+                elif localname == "ncbiTaxID":
+                    # check if it is an integer, but store as string
+                    number = int(value)
+                    ncbiTaxID = value
+                else:
+                    raise ValueError(
+                        "Unexpected attribute '%s' found in species tag" % key
+                    )
+            else:
+                raise ValueError(
+                    "Unexpected namespace '%s' for species attribute" % namespace
+                )
+        # The attributes "name" and "ncbiTaxID" are required:
+        if name is None:
+            raise ValueError("Failed to find species name")
+        if ncbiTaxID is None:
+            raise ValueError("Failed to find ncbiTaxId")
+        record = self.records[-1]
+        # The keywords for the species annotation are taken from SwissIO
+        record.annotations["organism"] = name
+        # TODO - Should have been a list to match SwissProt parser:
+        record.annotations["ncbi_taxid"] = ncbiTaxID
+        self.endElementNS = self.endSpeciesElement
+
+    def endSpeciesElement(self, name, qname):
+        """Handle end of a species element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError("Unexpected namespace '%s' for species end" % namespace)
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for species end" % qname)
+        if localname != "species":
+            raise RuntimeError("Failed to find end of species element")
+        self.endElementNS = self.endEntryElement
+
+    def startDescriptionElement(self, attrs):
+        """Parse the description."""
+        if attrs:
+            raise ValueError("Unexpected attributes found in description element")
+        if self.data is not None:
+            raise RuntimeError("Unexpected data found: '%s'" % self.data)
+        self.data = ""
+        self.endElementNS = self.endDescriptionElement
+
+    def endDescriptionElement(self, name, qname):
+        """Handle the end of a description element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError(
+                "Unexpected namespace '%s' for description end" % namespace
+            )
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for description end" % qname)
+        if localname != "description":
+            raise RuntimeError("Failed to find end of description element")
+        record = self.records[-1]
+        description = self.data
+        if description:  # ignore if empty string
+            record.description = description
+        self.data = None
+        self.endElementNS = self.endEntryElement
+
+    def startSequenceElement(self, attrs):
+        """Parse DNA, RNA, or protein sequence."""
+        if attrs:
+            raise ValueError("Unexpected attributes found in sequence element")
+        if self.data is not None:
+            raise RuntimeError("Unexpected data found: '%s'" % self.data)
+        self.data = ""
+        self.endElementNS = self.endSequenceElement
+
+    def endSequenceElement(self, name, qname):
+        """Handle the end of a sequence element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError("Unexpected namespace '%s' for sequence end" % namespace)
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for sequence end" % qname)
+        record = self.records[-1]
+        if localname == "DNAseq":
+            record.annotations["molecule_type"] = "DNA"
+        elif localname == "RNAseq":
+            record.annotations["molecule_type"] = "RNA"
+        elif localname == "AAseq":
+            record.annotations["molecule_type"] = "protein"
+        else:
+            raise RuntimeError(
+                "Failed to find end of sequence (localname = %s)" % localname
+            )
+        record.seq = Seq(self.data)
+        self.data = None
+        self.endElementNS = self.endEntryElement
+
+    def startDBRefElement(self, attrs):
+        """Parse a database cross reference."""
+        source = None
+        ID = None
+        for key, value in attrs.items():
+            namespace, localname = key
+            if namespace is None:
+                if localname == "source":
+                    source = value
+                elif localname == "id":
+                    ID = value
+                else:
+                    raise ValueError(
+                        "Unexpected attribute '%s' found for DBRef element" % key
+                    )
+            else:
+                raise ValueError(
+                    "Unexpected namespace '%s' for DBRef attribute" % namespace
+                )
+        # The attributes "source" and "id" are required:
+        if source is None:
+            raise ValueError("Failed to find source for DBRef element")
+        if ID is None:
+            raise ValueError("Failed to find id for DBRef element")
+        if self.data is not None:
+            raise RuntimeError("Unexpected data found: '%s'" % self.data)
+        self.data = ""
+        record = self.records[-1]
+        dbxref = "%s:%s" % (source, ID)
+        if dbxref not in record.dbxrefs:
+            record.dbxrefs.append(dbxref)
+        self.endElementNS = self.endDBRefElement
+
+    def endDBRefElement(self, name, qname):
+        """Handle the end of a DBRef element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError(
+                "Unexpected namespace '%s' for DBRef element" % namespace
+            )
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for DBRef element" % qname)
+        if localname != "DBRef":
+            raise RuntimeError(
+                "Unexpected localname '%s' for DBRef element" % localname
+            )
+        if self.data:
+            raise RuntimeError(
+                "Unexpected data received for DBRef element: '%s'" % self.data
+            )
+        self.data = None
+        self.endElementNS = self.endEntryElement
+
+    def startPropertyElement(self, attrs):
+        """Handle the start of a property element."""
+        property_name = None
+        property_value = None
+        for key, value in attrs.items():
+            namespace, localname = key
+            if namespace is None:
+                if localname == "name":
+                    property_name = value
+                elif localname == "value":
+                    property_value = value
+                else:
+                    raise ValueError(
+                        "Unexpected attribute '%s' found for property element" % key
+                    )
+            else:
+                raise ValueError(
+                    "Unexpected namespace '%s' for property attribute" % namespace
+                )
+        # The attribute "name" is required:
+        if property_name is None:
+            raise ValueError("Failed to find name for property element")
+        record = self.records[-1]
+        if property_name == "molecule_type":
+            # At this point, record.annotations["molecule_type"] is either
+            # "DNA", "RNA", or "protein"; property_value may be a more detailed
+            # description such as "mRNA" or "genomic DNA".
+            assert record.annotations[property_name] in property_value
+            record.annotations[property_name] = property_value
+        else:
+            if property_name not in record.annotations:
+                record.annotations[property_name] = []
+            record.annotations[property_name].append(property_value)
+        self.endElementNS = self.endPropertyElement
+
+    def endPropertyElement(self, name, qname):
+        """Handle the end of a property element."""
+        namespace, localname = name
+        if namespace is not None:
+            raise RuntimeError(
+                "Unexpected namespace '%s' for property element" % namespace
+            )
+        if qname is not None:
+            raise RuntimeError("Unexpected qname '%s' for property element" % qname)
+        if localname != "property":
+            raise RuntimeError(
+                "Unexpected localname '%s' for property element" % localname
+            )
+        self.endElementNS = self.endEntryElement
+
+    def characters(self, data):
+        """Handle character data."""
+        if self.data is not None:
+            self.data += data
+
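+
+# A minimal sketch (assuming "example.xml" is a SeqXML file, a hypothetical
+# name used only for illustration) of the incremental SAX pattern this handler
+# is written for; SeqXmlIterator below wraps exactly this loop:
+#
+#     from xml import sax
+#     from xml.sax import handler as sax_handler
+#
+#     parser = sax.make_parser()
+#     parser.setContentHandler(ContentHandler())
+#     parser.setFeature(sax_handler.feature_namespaces, True)
+#     with open("example.xml", "rb") as stream:
+#         for block in iter(lambda: stream.read(1024), b""):
+#             parser.feed(block)
+#     parser.close()
+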
+
+class SeqXmlIterator(SequenceIterator):
+    """Parser for seqXML files.
+
+    Parses seqXML files and creates SeqRecords.
+    Assumes the seqXML file is valid; please validate beforehand.
+    It is assumed that all information for one record can be found within a
+    record element or above. Two types of methods are called when the start
+    tag of an element is reached. To receive only the attributes of an
+    element before its end tag is reached implement _attr_TAGNAME.
+    To get an element and its children as a DOM tree implement _elem_TAGNAME.
+    Everything that is part of the DOM tree will not trigger any further
+    method calls.
+    """
+
+    BLOCK = 1024
+
+    def __init__(self, stream_or_path, namespace=None):
+        """Create the object and initialize the XML parser."""
+        # Make sure we got a binary handle. If we got a text handle, then
+        # the parser will still run but unicode characters will be garbled
+        # if the text handle was opened with a different encoding than the
+        # one specified in the XML file. With a binary handle, the correct
+        # encoding is picked up by the parser from the XML file.
+        self.parser = sax.make_parser()
+        content_handler = ContentHandler()
+        self.parser.setContentHandler(content_handler)
+        self.parser.setFeature(handler.feature_namespaces, True)
+        super().__init__(stream_or_path, mode="b", fmt="SeqXML")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        parser = self.parser
+        content_handler = parser.getContentHandler()
+        BLOCK = self.BLOCK
+        while True:
+            # Read in another block of the file...
+            text = handle.read(BLOCK)
+            if not text:
+                if content_handler.startElementNS is None:
+                    raise ValueError("Empty file.")
+                else:
+                    raise ValueError("XML file contains no data.")
+            parser.feed(text)
+            seqXMLversion = content_handler.seqXMLversion
+            if seqXMLversion is not None:
+                break
+        self.seqXMLversion = seqXMLversion
+        self.source = content_handler.source
+        self.sourceVersion = content_handler.sourceVersion
+        self.ncbiTaxID = content_handler.ncbiTaxID
+        self.speciesName = content_handler.speciesName
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Iterate over the records in the XML file."""
+        parser = self.parser
+        content_handler = parser.getContentHandler()
+        records = content_handler.records
+        BLOCK = self.BLOCK
+        while True:
+            if len(records) > 1:
+                # Then at least the first record is finished
+                record = records.pop(0)
+                yield record
+            # Read in another block of the file...
+            text = handle.read(BLOCK)
+            if not text:
+                break
+            parser.feed(text)
+        # We have reached the end of the XML file;
+        # send out the remaining records
+        yield from records
+        records.clear()
+        parser.close()
+
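+# Example usage (illustrative): the iterator is normally reached through the
+# Bio.SeqIO API rather than instantiated directly; "example.xml" stands for
+# any SeqXML file:
+#
+#     from Bio import SeqIO
+#     for record in SeqIO.parse("example.xml", "seqxml"):
+#         print(record.id, record.annotations.get("molecule_type"))
+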
+
+class SeqXmlWriter(SequenceWriter):
+    """Writes SeqRecords into seqXML file.
+
+    SeqXML requires the SeqRecord annotations to specify the molecule_type;
+    the molecule type is required to contain the term "DNA", "RNA", or
+    "protein".
+    """
+
+    def __init__(
+        self, target, source=None, source_version=None, species=None, ncbiTaxId=None
+    ):
+        """Create Object and start the xml generator.
+
+        Arguments:
+         - target - Output stream opened in binary mode, or a path to a file.
+         - source - The source program/database of the file, for example
+           UniProt.
+         - source_version - The version or release number of the source
+           program or database from which the data originated.
+         - species - The scientific name of the species of origin of all
+           entries in the file.
+         - ncbiTaxId - The NCBI taxonomy identifier of the species of origin.
+
+        """
+        super().__init__(target, "wb")
+        handle = self.handle
+        self.xml_generator = XMLGenerator(handle, "utf-8")
+        self.xml_generator.startDocument()
+        self.source = source
+        self.source_version = source_version
+        self.species = species
+        self.ncbiTaxId = ncbiTaxId
+
+    def write_header(self):
+        """Write root node with document metadata."""
+        attrs = {
+            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd",
+            "seqXMLversion": "0.4",
+        }
+
+        if self.source is not None:
+            attrs["source"] = self.source
+        if self.source_version is not None:
+            attrs["sourceVersion"] = self.source_version
+        if self.species is not None:
+            if not isinstance(self.species, str):
+                raise TypeError("species should be of type string")
+            attrs["speciesName"] = self.species
+        if self.ncbiTaxId is not None:
+            if not isinstance(self.ncbiTaxId, (str, int)):
+                raise TypeError("ncbiTaxID should be of type string or int")
+            attrs["ncbiTaxID"] = self.ncbiTaxId
+
+        self.xml_generator.startElement("seqXML", AttributesImpl(attrs))
+
+    def write_record(self, record):
+        """Write one record."""
+        if not record.id:
+            raise ValueError("SeqXML requires identifier")
+
+        if not isinstance(record.id, str):
+            raise TypeError("Identifier should be of type string")
+
+        attrb = {"id": record.id}
+
+        if (
+            "source" in record.annotations
+            and self.source != record.annotations["source"]
+        ):
+            if not isinstance(record.annotations["source"], str):
+                raise TypeError("source should be of type string")
+            attrb["source"] = record.annotations["source"]
+
+        self.xml_generator.startElement("entry", AttributesImpl(attrb))
+        self._write_species(record)
+        self._write_description(record)
+        self._write_seq(record)
+        self._write_dbxrefs(record)
+        self._write_properties(record)
+        self.xml_generator.endElement("entry")
+
+    def write_footer(self):
+        """Close the root node and finish the XML document."""
+        self.xml_generator.endElement("seqXML")
+        self.xml_generator.endDocument()
+
+    def _write_species(self, record):
+        """Write the species if given (PRIVATE)."""
+        local_ncbi_taxid = None
+        if "ncbi_taxid" in record.annotations:
+            local_ncbi_taxid = record.annotations["ncbi_taxid"]
+            if isinstance(local_ncbi_taxid, list):
+                # SwissProt parser uses a list (which could cope with chimeras)
+                if len(local_ncbi_taxid) == 1:
+                    local_ncbi_taxid = local_ncbi_taxid[0]
+                elif len(local_ncbi_taxid) == 0:
+                    local_ncbi_taxid = None
+                else:
+                    raise ValueError(
+                        "Multiple entries for record.annotations['ncbi_taxid'], %r"
+                        % local_ncbi_taxid
+                    )
+        if "organism" in record.annotations and local_ncbi_taxid:
+            local_org = record.annotations["organism"]
+
+            if not isinstance(local_org, str):
+                raise TypeError("organism should be of type string")
+
+            if not isinstance(local_ncbi_taxid, (str, int)):
+                raise TypeError("ncbiTaxID should be of type string or int")
+
+            # The local species definition is only written if it differs from the global species definition
+            if local_org != self.species or local_ncbi_taxid != self.ncbiTaxId:
+
+                attr = {"name": local_org, "ncbiTaxID": str(local_ncbi_taxid)}
+                self.xml_generator.startElement("species", AttributesImpl(attr))
+                self.xml_generator.endElement("species")
+
+    def _write_description(self, record):
+        """Write the description if given (PRIVATE)."""
+        if record.description:
+
+            if not isinstance(record.description, str):
+                raise TypeError("Description should be of type string")
+
+            self.xml_generator.startElement("description", AttributesImpl({}))
+            self.xml_generator.characters(record.description)
+            self.xml_generator.endElement("description")
+
+    def _write_seq(self, record):
+        """Write the sequence (PRIVATE).
+
+        Note that SeqXML requires the molecule type to contain the term
+        "DNA", "RNA", or "protein".
+        """
+        if isinstance(record.seq, UnknownSeq):
+            raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence")
+
+        seq = bytes(record.seq)
+
+        if not len(seq) > 0:
+            raise ValueError("The sequence length should be greater than 0")
+
+        molecule_type = record.annotations.get("molecule_type")
+        if molecule_type is None:
+            raise ValueError("molecule_type is not defined")
+        elif "DNA" in molecule_type:
+            seqElem = "DNAseq"
+        elif "RNA" in molecule_type:
+            seqElem = "RNAseq"
+        elif "protein" in molecule_type:
+            seqElem = "AAseq"
+        else:
+            raise ValueError("unknown molecule_type '%s'" % molecule_type)
+
+        self.xml_generator.startElement(seqElem, AttributesImpl({}))
+        self.xml_generator.characters(seq)
+        self.xml_generator.endElement(seqElem)
+
+    def _write_dbxrefs(self, record):
+        """Write all database cross references (PRIVATE)."""
+        if record.dbxrefs is not None:
+
+            for dbxref in record.dbxrefs:
+
+                if not isinstance(dbxref, str):
+                    raise TypeError("dbxrefs should be of type list of string")
+                if dbxref.find(":") < 1:
+                    raise ValueError(
+                        "dbxrefs should be in the form ['source:id', 'source:id' ]"
+                    )
+
+                dbsource, dbid = dbxref.split(":", 1)
+
+                attr = {"source": dbsource, "id": dbid}
+                self.xml_generator.startElement("DBRef", AttributesImpl(attr))
+                self.xml_generator.endElement("DBRef")
+
+    def _write_properties(self, record):
+        """Write all annotations that are key value pairs with values of a primitive type or list of primitive types (PRIVATE)."""
+        for key, value in record.annotations.items():
+
+            if key not in ("organism", "ncbi_taxid", "source"):
+
+                if value is None:
+
+                    attr = {"name": key}
+                    self.xml_generator.startElement("property", AttributesImpl(attr))
+                    self.xml_generator.endElement("property")
+
+                elif isinstance(value, list):
+
+                    for v in value:
+                        if v is None:
+                            attr = {"name": key}
+                        else:
+                            attr = {"name": key, "value": str(v)}
+                        self.xml_generator.startElement(
+                            "property", AttributesImpl(attr)
+                        )
+                        self.xml_generator.endElement("property")
+
+                elif isinstance(value, (int, float, str)):
+
+                    attr = {"name": key, "value": str(value)}
+                    self.xml_generator.startElement("property", AttributesImpl(attr))
+                    self.xml_generator.endElement("property")
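+
+
+# Example usage (illustrative): as with reading, writing is normally driven
+# through the Bio.SeqIO API, which calls write_header/write_record/write_footer
+# in turn; "records" stands for any iterable of SeqRecord objects that have
+# molecule_type set in their annotations:
+#
+#     from Bio import SeqIO
+#     SeqIO.write(records, "out.xml", "seqxml")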
diff --git a/code/lib/Bio/SeqIO/SffIO.py b/code/lib/Bio/SeqIO/SffIO.py
new file mode 100644
index 0000000..18d3ab1
--- /dev/null
+++ b/code/lib/Bio/SeqIO/SffIO.py
@@ -0,0 +1,1494 @@
+# Copyright 2009-2020 by Peter Cock.  All rights reserved.
+# Based on code contributed and copyright 2009 by Jose Blanca (COMAV-UPV).
+#
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
+
+SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
+Biomedical Research and the Wellcome Trust Sanger Institute. SFF was also used
+as the native output format from early versions of Ion Torrent's PGM platform.
+You are expected to use this module via the Bio.SeqIO functions under
+the format name "sff" (or "sff-trim" as described below).
+
+For example, to iterate over the records in an SFF file,
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
+    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    ...
+    E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
+    E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
+    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
+    E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
+    E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
+    E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
+    E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
+    E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
+    E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
+    E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
+
+Each SeqRecord object will contain all the annotation from the SFF file,
+including the PHRED quality scores.
+
+    >>> print("%s %i" % (record.id, len(record)))
+    E3MFGYR02F7Z7G 219
+    >>> print("%s..." % record.seq[:10])
+    tcagAATCAT...
+    >>> print("%r..." % (record.letter_annotations["phred_quality"][:10]))
+    [22, 21, 23, 28, 26, 15, 12, 21, 28, 21]...
+
+Notice that the sequence is given in mixed case, the central upper case region
+corresponds to the trimmed sequence. This matches the output of the Roche
+tools (and the 3rd party tool sff_extract) for SFF to FASTA.
+
+    >>> print(record.annotations["clip_qual_left"])
+    4
+    >>> print(record.annotations["clip_qual_right"])
+    134
+    >>> print(record.seq[:4])
+    tcag
+    >>> print("%s...%s" % (record.seq[4:20], record.seq[120:134]))
+    AATCATCCACTTTTTA...CAAAACACAAACAG
+    >>> print(record.seq[134:])
+    atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
+
+The annotations dictionary also contains any adapter clip positions
+(usually zero), and information about the flows. e.g.
+
+    >>> len(record.annotations)
+    12
+    >>> print(record.annotations["flow_key"])
+    TCAG
+    >>> print(record.annotations["flow_values"][:10])
+    (83, 1, 128, 7, 4, 84, 6, 106, 3, 172)
+    >>> print(len(record.annotations["flow_values"]))
+    400
+    >>> print(record.annotations["flow_index"][:10])
+    (1, 2, 3, 2, 2, 0, 3, 2, 3, 3)
+    >>> print(len(record.annotations["flow_index"]))
+    219
+
+Note that to convert from a raw reading in flow_values to the corresponding
+homopolymer stretch estimate, the value should be rounded to the nearest 100:
+
+    >>> print("%r..." % [int(round(value, -2)) // 100
+    ...                  for value in record.annotations["flow_values"][:10]])
+    ...
+    [1, 0, 1, 0, 0, 1, 0, 1, 0, 2]...
+
+If a read name is exactly 14 alphanumeric characters, the annotations
+dictionary will also contain meta-data about the read extracted by
+interpreting the name as a 454 Sequencing System "Universal" Accession
+Number. Note that if a read name happens to be exactly 14 alphanumeric
+characters but was not generated automatically, these annotation records
+will contain nonsense information.
+
+    >>> print(record.annotations["region"])
+    2
+    >>> print(record.annotations["time"])
+    [2008, 1, 9, 16, 16, 0]
+    >>> print(record.annotations["coords"])
+    (2434, 1658)
+
+As a convenience method, you can read the file with SeqIO format name "sff-trim"
+instead of "sff" to get just the trimmed sequences (without any annotation
+except for the PHRED quality scores and anything encoded in the read names):
+
+    >>> from Bio import SeqIO
+    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
+    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    ...
+    E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
+    E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
+    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
+    E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
+    E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
+    E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
+    E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
+    E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
+    E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
+    E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
+
+Looking at the final record in more detail, note how this differs to the
+example above:
+
+    >>> print("%s %i" % (record.id, len(record)))
+    E3MFGYR02F7Z7G 130
+    >>> print("%s..." % record.seq[:10])
+    AATCATCCAC...
+    >>> print("%r..." % record.letter_annotations["phred_quality"][:10])
+    [26, 15, 12, 21, 28, 21, 36, 28, 27, 27]...
+    >>> len(record.annotations)
+    4
+    >>> print(record.annotations["region"])
+    2
+    >>> print(record.annotations["coords"])
+    (2434, 1658)
+    >>> print(record.annotations["time"])
+    [2008, 1, 9, 16, 16, 0]
+    >>> print(record.annotations["molecule_type"])
+    DNA
+
+You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
+reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
+
+    >>> from Bio import SeqIO
+    >>> from io import StringIO
+    >>> out_handle = StringIO()
+    >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
+    ...                       out_handle, "fastq")
+    ...
+    >>> print("Converted %i records" % count)
+    Converted 10 records
+
+The output FASTQ file would start like this:
+
+    >>> print("%s..." % out_handle.getvalue()[:50])
+    @E3MFGYR02JWQ7T
+    tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
+
+Bio.SeqIO.index() provides memory-efficient random access to the reads in an
+SFF file by name. SFF files can include an index within the file, which can
+be read in, making this very fast. If the index is missing (or in a format not
+yet supported in Biopython) the file is indexed by scanning all the reads -
+which is a little slower. For example,
+
+    >>> from Bio import SeqIO
+    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
+    >>> record = reads["E3MFGYR02JHD4H"]
+    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
+    >>> reads.close()
+
+Or, using the trimmed reads:
+
+    >>> from Bio import SeqIO
+    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
+    >>> record = reads["E3MFGYR02JHD4H"]
+    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
+    >>> reads.close()
+
+You can also use the Bio.SeqIO.write() function with the "sff" format. Note
+that this requires all the flow information etc, and thus is probably only
+useful for SeqRecord objects originally from reading another SFF file (and
+not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
+
+As an example, let's pretend this example SFF file represents some DNA which
+was pre-amplified with a PCR primer AAAGANNNNN. The following script would
+produce a sub-file containing all those reads whose post-quality clipping
+region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
+degenerate bit of this pretend primer):
+
+    >>> from Bio import SeqIO
+    >>> records = (record for record in
+    ...            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
+    ...            if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
+    ...
+    >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
+    >>> print("Selected %i records" % count)
+    Selected 2 records
+
+Of course, for an assembly you would probably want to remove these primers.
+If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
+if you want SFF output we have to preserve all the flow information - the trick
+is just to adjust the left clip position!
+
+    >>> from Bio import SeqIO
+    >>> def filter_and_trim(records, primer):
+    ...     for record in records:
+    ...         if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
+    ...             record.annotations["clip_qual_left"] += len(primer)
+    ...             yield record
+    ...
+    >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
+    >>> count = SeqIO.write(filter_and_trim(records, "AAAGA"),
+    ...                     "temp_filtered.sff", "sff")
+    ...
+    >>> print("Selected %i records" % count)
+    Selected 2 records
+
+We can check the results, note the lower case clipped region now includes the "AAAGA"
+sequence:
+
+    >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
+    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    ...
+    E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
+    E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
+    >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
+    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
+    ...
+    E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
+    E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
+    >>> import os
+    >>> os.remove("temp_filtered.sff")
+
+For a description of the file format, please see the Roche manuals and:
+http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
+
+"""
+import re
+import struct
+
+from Bio import StreamModeError
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+_null = b"\0"
+_sff = b".sff"
+_hsh = b".hsh"
+_srt = b".srt"
+_mft = b".mft"
+_flag = b"\xff"
+
+
+def _sff_file_header(handle):
+    """Read in an SFF file header (PRIVATE).
+
+    Assumes the handle is at the start of the file, will read forwards
+    through the header and leave the handle pointing at the first record.
+    Returns a tuple of values from the header (header_length, index_offset,
+    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)
+
+    >>> with open("Roche/greek.sff", "rb") as handle:
+    ...     values = _sff_file_header(handle)
+    ...
+    >>> print(values[0])
+    840
+    >>> print(values[1])
+    65040
+    >>> print(values[2])
+    256
+    >>> print(values[3])
+    24
+    >>> print(values[4])
+    800
+    >>> values[-1]
+    'TCAG'
+
+    """
+    # file header (part one)
+    # use big endian encoding    >
+    # magic_number               I
+    # version                    4B
+    # index_offset               Q
+    # index_length               I
+    # number_of_reads            I
+    # header_length              H
+    # key_length                 H
+    # number_of_flows_per_read   H
+    # flowgram_format_code       B
+    # [rest of file header depends on the number of flows and how many keys]
+    fmt = ">4s4BQIIHHHB"
+    assert 31 == struct.calcsize(fmt)
+    data = handle.read(31)
+    if not data:
+        raise ValueError("Empty file.")
+    elif len(data) < 31:
+        raise ValueError("File too small to hold a valid SFF header.")
+    try:
+        (
+            magic_number,
+            ver0,
+            ver1,
+            ver2,
+            ver3,
+            index_offset,
+            index_length,
+            number_of_reads,
+            header_length,
+            key_length,
+            number_of_flows_per_read,
+            flowgram_format,
+        ) = struct.unpack(fmt, data)
+    except TypeError:
+        raise StreamModeError("SFF files must be opened in binary mode.") from None
+    if magic_number in [_hsh, _srt, _mft]:
+        # Probably user error, calling Bio.SeqIO.parse() twice!
+        raise ValueError("Handle seems to be at SFF index block, not start")
+    if magic_number != _sff:  # 779314790
+        raise ValueError("SFF file did not start '.sff', but %r" % magic_number)
+    if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
+        raise ValueError(
+            "Unsupported SFF version in header, %i.%i.%i.%i" % (ver0, ver1, ver2, ver3)
+        )
+    if flowgram_format != 1:
+        raise ValueError("Flowgram format code %i not supported" % flowgram_format)
+    if (index_offset != 0) ^ (index_length != 0):
+        raise ValueError(
+            "Index offset %i but index length %i" % (index_offset, index_length)
+        )
+    flow_chars = handle.read(number_of_flows_per_read).decode("ASCII")
+    key_sequence = handle.read(key_length).decode("ASCII")
+    # According to the spec, the header_length field should be the total number
+    # of bytes required by this set of header fields, and should be equal to
+    # "31 + number_of_flows_per_read + key_length" rounded up to the next value
+    # divisible by 8.
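+    # For example, with the 800 flows and the 4 base key "TCAG" of the doctest
+    # above: 31 + 800 + 4 = 835, which rounds up to a header_length of 840.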
+    assert header_length % 8 == 0
+    padding = header_length - number_of_flows_per_read - key_length - 31
+    assert 0 <= padding < 8, padding
+    if handle.read(padding).count(_null) != padding:
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Your SFF file is invalid, post header %i byte "
+            "null padding region contained data." % padding,
+            BiopythonParserWarning,
+        )
+    return (
+        header_length,
+        index_offset,
+        index_length,
+        number_of_reads,
+        number_of_flows_per_read,
+        flow_chars,
+        key_sequence,
+    )
+
+
+def _sff_do_slow_index(handle):
+    """Generate an index by scanning though all the reads in an SFF file (PRIVATE).
+
+    This is a slow but generic approach if we can't parse the provided index
+    (if present).
+
+    Will use the handle seek/tell functions.
+    """
+    handle.seek(0)
+    (
+        header_length,
+        index_offset,
+        index_length,
+        number_of_reads,
+        number_of_flows_per_read,
+        flow_chars,
+        key_sequence,
+    ) = _sff_file_header(handle)
+    # Now on to the reads...
+    read_header_fmt = ">2HI4H"
+    read_header_size = struct.calcsize(read_header_fmt)
+    # NOTE - assuming flowgram_format==1, which means struct type H
+    read_flow_fmt = ">%iH" % number_of_flows_per_read
+    read_flow_size = struct.calcsize(read_flow_fmt)
+    assert 1 == struct.calcsize(">B")
+    assert 1 == struct.calcsize(">s")
+    assert 1 == struct.calcsize(">c")
+    assert read_header_size % 8 == 0  # Important for padding calc later!
+    for read in range(number_of_reads):
+        record_offset = handle.tell()
+        if record_offset == index_offset:
+            # Found index block within reads, ignore it:
+            offset = index_offset + index_length
+            if offset % 8:
+                offset += 8 - (offset % 8)
+            assert offset % 8 == 0
+            handle.seek(offset)
+            record_offset = offset
+        # assert record_offset%8 == 0 # Worth checking, but slow
+        # First the fixed header
+        data = handle.read(read_header_size)
+        (
+            read_header_length,
+            name_length,
+            seq_len,
+            clip_qual_left,
+            clip_qual_right,
+            clip_adapter_left,
+            clip_adapter_right,
+        ) = struct.unpack(read_header_fmt, data)
+        if read_header_length < 10 or read_header_length % 8 != 0:
+            raise ValueError(
+                "Malformed read header, says length is %i:\n%r"
+                % (read_header_length, data)
+            )
+        # now the name and any padding (remainder of header)
+        name = handle.read(name_length).decode()
+        padding = read_header_length - read_header_size - name_length
+        if handle.read(padding).count(_null) != padding:
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Your SFF file is invalid, post name %i byte "
+                "padding region contained data" % padding,
+                BiopythonParserWarning,
+            )
+        assert record_offset + read_header_length == handle.tell()
+        # now the flowgram values, flowgram index, bases and qualities
+        size = read_flow_size + 3 * seq_len
+        handle.seek(size, 1)
+        # now any padding...
+        padding = size % 8
+        if padding:
+            padding = 8 - padding
+            if handle.read(padding).count(_null) != padding:
+                import warnings
+                from Bio import BiopythonParserWarning
+
+                warnings.warn(
+                    "Your SFF file is invalid, post quality %i "
+                    "byte padding region contained data" % padding,
+                    BiopythonParserWarning,
+                )
+        # print("%s %s %i" % (read, name, record_offset))
+        yield name, record_offset
+    if handle.tell() % 8 != 0:
+        raise ValueError("After scanning reads, did not end on a multiple of 8")
+
+
+def _sff_find_roche_index(handle):
+    """Locate any existing Roche style XML meta data and read index (PRIVATE).
+
+    Makes a number of hard-coded assumptions based on reverse-engineered SFF
+    files from Roche 454 machines.
+
+    Returns a tuple of read count, SFF "index" offset and size, XML offset
+    and size, and the actual read index offset and size.
+
+    Raises a ValueError for unsupported or non-Roche index blocks.
+    """
+    handle.seek(0)
+    (
+        header_length,
+        index_offset,
+        index_length,
+        number_of_reads,
+        number_of_flows_per_read,
+        flow_chars,
+        key_sequence,
+    ) = _sff_file_header(handle)
+    assert handle.tell() == header_length
+    if not index_offset or not index_length:
+        raise ValueError("No index present in this SFF file")
+    # Now jump to the header...
+    handle.seek(index_offset)
+    fmt = ">4s4B"
+    fmt_size = struct.calcsize(fmt)
+    data = handle.read(fmt_size)
+    if not data:
+        raise ValueError(
+            "Premature end of file? Expected index of size %i at offest %i, found nothing"
+            % (index_length, index_offset)
+        )
+    if len(data) < fmt_size:
+        raise ValueError(
+            "Premature end of file? Expected index of size %i at offest %i, found %r"
+            % (index_length, index_offset, data)
+        )
+    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
+    if magic_number == _mft:  # 778921588
+        # Roche 454 manifest index
+        # This is typical from raw Roche 454 SFF files (2009), and includes
+        # both an XML manifest and the sorted index.
+        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
+            # This is "1.00" as a string
+            raise ValueError(
+                "Unsupported version in .mft index header, %i.%i.%i.%i"
+                % (ver0, ver1, ver2, ver3)
+            )
+        fmt2 = ">LL"
+        fmt2_size = struct.calcsize(fmt2)
+        xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
+        if index_length != fmt_size + fmt2_size + xml_size + data_size:
+            raise ValueError(
+                "Problem understanding .mft index header, %i != %i + %i + %i + %i"
+                % (index_length, fmt_size, fmt2_size, xml_size, data_size)
+            )
+        return (
+            number_of_reads,
+            header_length,
+            index_offset,
+            index_length,
+            index_offset + fmt_size + fmt2_size,
+            xml_size,
+            index_offset + fmt_size + fmt2_size + xml_size,
+            data_size,
+        )
+    elif magic_number == _srt:  # 779317876
+        # Roche 454 sorted index
+        # I've had this from the Roche tool sfffile when the read identifiers
+        # had nonstandard lengths and there was no XML manifest.
+        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
+            # This is "1.00" as a string
+            raise ValueError(
+                "Unsupported version in .srt index header, %i.%i.%i.%i"
+                % (ver0, ver1, ver2, ver3)
+            )
+        data = handle.read(4)
+        if data != _null * 4:
+            raise ValueError("Did not find expected null four bytes in .srt index")
+        return (
+            number_of_reads,
+            header_length,
+            index_offset,
+            index_length,
+            0,
+            0,
+            index_offset + fmt_size + 4,
+            index_length - fmt_size - 4,
+        )
+    elif magic_number == _hsh:
+        raise ValueError(
+            "Hash table style indexes (.hsh) in SFF files are not (yet) supported"
+        )
+    else:
+        raise ValueError(
+            "Unknown magic number %r in SFF index header:\n%r" % (magic_number, data)
+        )
+
+
+def ReadRocheXmlManifest(handle):
+    """Read any Roche style XML manifest data in the SFF "index".
+
+    The SFF file format allows for multiple different index blocks, and Roche
+    took advantage of this to define their own index block which also embeds
+    an XML manifest string. This is not a publicly documented extension to
+    the SFF file format; it was reverse engineered.
+
+    The handle should be to an SFF file opened in binary mode. This function
+    will use the handle seek/tell functions and leave the handle in an
+    arbitrary location.
+
+    Any XML manifest found is returned as a Python string, which you can then
+    parse as appropriate, or reuse when writing out SFF files with the
+    SffWriter class.
+
+    Returns a string, or raises a ValueError if a Roche manifest could not be
+    found.
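+
+    A minimal usage sketch (the file name here is illustrative)::
+
+        with open("example.sff", "rb") as handle:
+            xml_string = ReadRocheXmlManifest(handle)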
+    """
+    (
+        number_of_reads,
+        header_length,
+        index_offset,
+        index_length,
+        xml_offset,
+        xml_size,
+        read_index_offset,
+        read_index_size,
+    ) = _sff_find_roche_index(handle)
+    if not xml_offset or not xml_size:
+        raise ValueError("No XML manifest found")
+    handle.seek(xml_offset)
+    return handle.read(xml_size).decode()
+
+
+# This is a generator function!
+def _sff_read_roche_index(handle):
+    """Read any existing Roche style read index provided in the SFF file (PRIVATE).
+
+    Will use the handle seek/tell functions.
+
+    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
+
+    Roche SFF indices use base 255, not 256, meaning we see bytes only in
+    the range 0 to 254. This appears to be so that byte 0xFF (character 255)
+    can be used as a marker character to separate entries (required if the
+    read name lengths vary).
+
+    Note that since only four bytes are used for the read offset, this is
+    limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
+    tool to combine SFF files beyond this limit, it issues a warning and
+    omits the index (and manifest).
+    """
+    (
+        number_of_reads,
+        header_length,
+        index_offset,
+        index_length,
+        xml_offset,
+        xml_size,
+        read_index_offset,
+        read_index_size,
+    ) = _sff_find_roche_index(handle)
+    # Now parse the read index...
+    handle.seek(read_index_offset)
+    fmt = ">5B"
+    for read in range(number_of_reads):
+        # TODO - Be more aware of when the index should end?
+        data = handle.read(6)
+        while True:
+            more = handle.read(1)
+            if not more:
+                raise ValueError("Premature end of file!")
+            data += more
+            if more == _flag:
+                break
+        assert data[-1:] == _flag, data[-1:]
+        name = data[:-6].decode()
+        off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
+        offset = off0 + 255 * off1 + 65025 * off2 + 16581375 * off3
+        if off4:
+            # Could in theory be used as a fifth piece of offset information,
+            # i.e. offset += 4228250625 * off4, but testing with the Roche
+            # tools this is not the case. They simply don't support such
+            # large indexes.
+            raise ValueError("Expected a null terminator to the read name.")
+        yield name, offset
+    if handle.tell() != read_index_offset + read_index_size:
+        raise ValueError(
+            "Problem with index length? %i vs %i"
+            % (handle.tell(), read_index_offset + read_index_size)
+        )
+
+
+_valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$")
+
+
+def _sff_read_seq_record(
+    handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False
+):
+    """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
+    # Now on to the reads...
+    # the read header format (fixed part):
+    # read_header_length     H
+    # name_length            H
+    # seq_len                I
+    # clip_qual_left         H
+    # clip_qual_right        H
+    # clip_adapter_left      H
+    # clip_adapter_right     H
+    # [rest of read header depends on the name length etc]
+    read_header_fmt = ">2HI4H"
+    read_header_size = struct.calcsize(read_header_fmt)
+    read_flow_fmt = ">%iH" % number_of_flows_per_read
+    read_flow_size = struct.calcsize(read_flow_fmt)
+
+    (
+        read_header_length,
+        name_length,
+        seq_len,
+        clip_qual_left,
+        clip_qual_right,
+        clip_adapter_left,
+        clip_adapter_right,
+    ) = struct.unpack(read_header_fmt, handle.read(read_header_size))
+    if clip_qual_left:
+        clip_qual_left -= 1  # python counting
+    if clip_adapter_left:
+        clip_adapter_left -= 1  # python counting
+    if read_header_length < 10 or read_header_length % 8 != 0:
+        raise ValueError(
+            "Malformed read header, says length is %i" % read_header_length
+        )
+    # now the name and any padding (remainder of header)
+    name = handle.read(name_length).decode()
+    padding = read_header_length - read_header_size - name_length
+    if handle.read(padding).count(_null) != padding:
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Your SFF file is invalid, post name %i "
+            "byte padding region contained data" % padding,
+            BiopythonParserWarning,
+        )
+    # now the flowgram values, flowgram index, bases and qualities
+    # NOTE - assuming flowgram_format==1, which means struct type H
+    flow_values = handle.read(read_flow_size)  # unpack later if needed
+    temp_fmt = ">%iB" % seq_len  # used for flow index and quals
+    flow_index = handle.read(seq_len)  # unpack later if needed
+    seq = handle.read(seq_len)  # Leave as bytes for Seq object
+    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
+    # now any padding...
+    padding = (read_flow_size + seq_len * 3) % 8
+    if padding:
+        padding = 8 - padding
+        if handle.read(padding).count(_null) != padding:
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Your SFF file is invalid, post quality %i "
+                "byte padding region contained data" % padding,
+                BiopythonParserWarning,
+            )
+    # Follow Roche and apply most aggressive of qual and adapter clipping.
+    # Note Roche seems to ignore adapter clip fields when writing SFF,
+    # and uses just the quality clipping values for any clipping.
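+    # For example (after the python-counting adjustment above):
+    # clip_qual_left=4 with clip_adapter_left=0 gives clip_left=4, and
+    # clip_qual_right=300 with clip_adapter_right=0 gives clip_right=300.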
+    clip_left = max(clip_qual_left, clip_adapter_left)
+    # Right clipping of zero means no clipping
+    if clip_qual_right:
+        if clip_adapter_right:
+            clip_right = min(clip_qual_right, clip_adapter_right)
+        else:
+            # Typical case with Roche SFF files
+            clip_right = clip_qual_right
+    elif clip_adapter_right:
+        clip_right = clip_adapter_right
+    else:
+        clip_right = seq_len
+    # Now build a SeqRecord
+    if trim:
+        if clip_left >= clip_right:
+            # Raise an error?
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Overlapping clip values in SFF record, trimmed to nothing",
+                BiopythonParserWarning,
+            )
+            seq = ""
+            quals = []
+        else:
+            seq = seq[clip_left:clip_right].upper()
+            quals = quals[clip_left:clip_right]
+        # Don't record the clipping values, flow etc, as they make no sense now:
+        annotations = {}
+    else:
+        if clip_left >= clip_right:
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Overlapping clip values in SFF record", BiopythonParserWarning
+            )
+            seq = seq.lower()
+        else:
+            # This use of mixed case mimics the Roche SFF tool's FASTA output
+            seq = (
+                seq[:clip_left].lower()
+                + seq[clip_left:clip_right].upper()
+                + seq[clip_right:].lower()
+            )
+        annotations = {
+            "flow_values": struct.unpack(read_flow_fmt, flow_values),
+            "flow_index": struct.unpack(temp_fmt, flow_index),
+            "flow_chars": flow_chars,
+            "flow_key": key_sequence,
+            "clip_qual_left": clip_qual_left,
+            "clip_qual_right": clip_qual_right,
+            "clip_adapter_left": clip_adapter_left,
+            "clip_adapter_right": clip_adapter_right,
+        }
+    if re.match(_valid_UAN_read_name, name):
+        annotations["time"] = _get_read_time(name)
+        annotations["region"] = _get_read_region(name)
+        annotations["coords"] = _get_read_xy(name)
+    annotations["molecule_type"] = "DNA"
+    record = SeqRecord(
+        Seq(seq), id=name, name=name, description="", annotations=annotations
+    )
+    # Dirty trick to speed up this line:
+    # record.letter_annotations["phred_quality"] = quals
+    dict.__setitem__(record._per_letter_annotations, "phred_quality", quals)
+    # Return the record and then continue...
+    return record
+
+
+_powers_of_36 = [36 ** i for i in range(6)]
+
+
+def _string_as_base_36(string):
+    """Interpret a string as a base-36 number as per 454 manual (PRIVATE)."""
+    total = 0
+    for c, power in zip(string[::-1], _powers_of_36):
+        # For reference: ord('0') = 48, ord('9') = 57
+        # For reference: ord('A') = 65, ord('Z') = 90
+        # For reference: ord('a') = 97, ord('z') = 122
+        if 48 <= ord(c) <= 57:
+            val = ord(c) - 22  # equivalent to: - ord('0') + 26
+        elif 65 <= ord(c) <= 90:
+            val = ord(c) - 65
+        elif 97 <= ord(c) <= 122:
+            val = ord(c) - 97
+        else:
+            # Invalid character
+            val = 0
+        total += val * power
+    return total
+
+
+def _get_read_xy(read_name):
+    """Extract coordinates from last 5 characters of read name (PRIVATE)."""
+    number = _string_as_base_36(read_name[9:])
+    return divmod(number, 4096)
+
+
+_time_denominators = [
+    13 * 32 * 24 * 60 * 60,
+    32 * 24 * 60 * 60,
+    24 * 60 * 60,
+    60 * 60,
+    60,
+]
+
+
+def _get_read_time(read_name):
+    """Extract time from first 6 characters of read name (PRIVATE)."""
+    time_list = []
+    remainder = _string_as_base_36(read_name[:6])
+    for denominator in _time_denominators:
+        this_term, remainder = divmod(remainder, denominator)
+        time_list.append(this_term)
+    time_list.append(remainder)
+    time_list[0] += 2000
+    return time_list
+
+
+def _get_read_region(read_name):
+    """Extract region from read name (PRIVATE)."""
+    return int(read_name[8])
+
+
+def _sff_read_raw_record(handle, number_of_flows_per_read):
+    """Extract the next read in the file as a raw (bytes) string (PRIVATE)."""
+    read_header_fmt = ">2HI"
+    read_header_size = struct.calcsize(read_header_fmt)
+    read_flow_fmt = ">%iH" % number_of_flows_per_read
+    read_flow_size = struct.calcsize(read_flow_fmt)
+
+    raw = handle.read(read_header_size)
+    read_header_length, name_length, seq_len = struct.unpack(read_header_fmt, raw)
+    if read_header_length < 10 or read_header_length % 8 != 0:
+        raise ValueError(
+            "Malformed read header, says length is %i" % read_header_length
+        )
+    # now the four clip values (4H = 8 bytes), and read name
+    raw += handle.read(8 + name_length)
+    # and any padding (remainder of header)
+    padding = read_header_length - read_header_size - 8 - name_length
+    pad = handle.read(padding)
+    if pad.count(_null) != padding:
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Your SFF file is invalid, post name %i "
+            "byte padding region contained data" % padding,
+            BiopythonParserWarning,
+        )
+    raw += pad
+    # now the flowgram values, flowgram index, bases and qualities
+    raw += handle.read(read_flow_size + seq_len * 3)
+    padding = (read_flow_size + seq_len * 3) % 8
+    # now any padding...
+    if padding:
+        padding = 8 - padding
+        pad = handle.read(padding)
+        if pad.count(_null) != padding:
+            import warnings
+            from Bio import BiopythonParserWarning
+
+            warnings.warn(
+                "Your SFF file is invalid, post quality %i "
+                "byte padding region contained data" % padding,
+                BiopythonParserWarning,
+            )
+        raw += pad
+    # Return the raw bytes
+    return raw
+
+
+class _AddTellHandle:
+    """Wrapper for handles which do not support the tell method (PRIVATE).
+
+    Intended for use with things like network handles where tell (and reverse
+    seek) are not supported. The SFF file needs to track the current offset in
+    order to deal with the index block.
+    """
+
+    def __init__(self, handle):
+        self._handle = handle
+        self._offset = 0
+
+    def read(self, length):
+        data = self._handle.read(length)
+        self._offset += len(data)
+        return data
+
+    def tell(self):
+        return self._offset
+
+    def seek(self, offset):
+        # Forward-only seek implemented by reading and discarding bytes;
+        # record the new position so tell() stays accurate.
+        if offset < self._offset:
+            raise RuntimeError("Can't seek backwards")
+        self._handle.read(offset - self._offset)
+        self._offset = offset
+
+    def close(self):
+        return self._handle.close()
+
+
+class SffIterator(SequenceIterator):
+    """Parser for Standard Flowgram Format (SFF) files."""
+
+    def __init__(self, source, alphabet=None, trim=False):
+        """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
+
+            - source - path to an SFF file, e.g. from Roche 454 sequencing,
+              or a file-like object opened in binary mode.
+            - alphabet - optional alphabet, unused. Leave as None.
+            - trim - should the sequences be trimmed?
+
+        The resulting SeqRecord objects should match those from a paired FASTA
+        and QUAL file converted from the SFF file using the Roche 454 tool
+        sffinfo, i.e. the sequence will be mixed case, with the trim regions
+        shown in lower case.
+
+        This function is used internally via the Bio.SeqIO functions:
+
+        >>> from Bio import SeqIO
+        >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
+        ...     print("%s %i" % (record.id, len(record)))
+        ...
+        E3MFGYR02JWQ7T 265
+        E3MFGYR02JA6IL 271
+        E3MFGYR02JHD4H 310
+        E3MFGYR02GFKUC 299
+        E3MFGYR02FTGED 281
+        E3MFGYR02FR9G7 261
+        E3MFGYR02GAZMS 278
+        E3MFGYR02HHZ8O 221
+        E3MFGYR02GPGB1 269
+        E3MFGYR02F7Z7G 219
+
+        You can also call it directly:
+
+        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
+        ...     for record in SffIterator(handle):
+        ...         print("%s %i" % (record.id, len(record)))
+        ...
+        E3MFGYR02JWQ7T 265
+        E3MFGYR02JA6IL 271
+        E3MFGYR02JHD4H 310
+        E3MFGYR02GFKUC 299
+        E3MFGYR02FTGED 281
+        E3MFGYR02FR9G7 261
+        E3MFGYR02GAZMS 278
+        E3MFGYR02HHZ8O 221
+        E3MFGYR02GPGB1 269
+        E3MFGYR02F7Z7G 219
+
+        Or, with the trim option:
+
+        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
+        ...     for record in SffIterator(handle, trim=True):
+        ...         print("%s %i" % (record.id, len(record)))
+        ...
+        E3MFGYR02JWQ7T 260
+        E3MFGYR02JA6IL 265
+        E3MFGYR02JHD4H 292
+        E3MFGYR02GFKUC 295
+        E3MFGYR02FTGED 277
+        E3MFGYR02FR9G7 256
+        E3MFGYR02GAZMS 271
+        E3MFGYR02HHZ8O 150
+        E3MFGYR02GPGB1 221
+        E3MFGYR02F7Z7G 130
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        super().__init__(source, mode="b", fmt="SFF")
+        self.trim = trim
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        try:
+            if 0 != handle.tell():
+                raise ValueError("Not at start of file, offset %i" % handle.tell())
+        except AttributeError:
+            # Probably a network handle or something like that
+            handle = _AddTellHandle(handle)
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        trim = self.trim
+        (
+            header_length,
+            index_offset,
+            index_length,
+            number_of_reads,
+            number_of_flows_per_read,
+            flow_chars,
+            key_sequence,
+        ) = _sff_file_header(handle)
+        # Now on to the reads...
+        # the read header format (fixed part):
+        # read_header_length     H
+        # name_length            H
+        # seq_len                I
+        # clip_qual_left         H
+        # clip_qual_right        H
+        # clip_adapter_left      H
+        # clip_adapter_right     H
+        # [rest of read header depends on the name length etc]
+        read_header_fmt = ">2HI4H"
+        read_header_size = struct.calcsize(read_header_fmt)
+        read_flow_fmt = ">%iH" % number_of_flows_per_read
+        read_flow_size = struct.calcsize(read_flow_fmt)
+        assert 1 == struct.calcsize(">B")
+        assert 1 == struct.calcsize(">s")
+        assert 1 == struct.calcsize(">c")
+        assert read_header_size % 8 == 0  # Important for padding calc later!
+        # The spec allows for the index block to be before or even in the middle
+        # of the reads. We can check that if we keep track of our position
+        # in the file...
+        for read in range(number_of_reads):
+            if index_offset and handle.tell() == index_offset:
+                offset = index_offset + index_length
+                if offset % 8:
+                    offset += 8 - (offset % 8)
+                assert offset % 8 == 0
+                handle.seek(offset)
+                # Now that we've done this, we don't need to do it again. Clear
+                # the index_offset so we can skip extra handle.tell() calls:
+                index_offset = 0
+            yield _sff_read_seq_record(
+                handle, number_of_flows_per_read, flow_chars, key_sequence, trim,
+            )
+        _check_eof(handle, index_offset, index_length)
+
+
+def _check_eof(handle, index_offset, index_length):
+    """Check final padding is OK (8 byte alignment) and file ends (PRIVATE).
+
+    Will attempt to spot apparent SFF file concatenation and give an error.
+
+    Will not attempt to seek, only moves the handle forward.
+    """
+    offset = handle.tell()
+    extra = b""
+    padding = 0
+
+    if index_offset and offset <= index_offset:
+        # Index block then end of file...
+        if offset < index_offset:
+            raise ValueError(
+                "Gap of %i bytes after final record end %i, "
+                "before %i where index starts?"
+                % (index_offset - offset, offset, index_offset)
+            )
+        # Doing read to jump the index rather than a seek
+        # in case this is a network handle or similar
+        handle.read(index_offset + index_length - offset)
+        offset = index_offset + index_length
+        if offset != handle.tell():
+            raise ValueError(
+                "Wanted %i, got %i, index is %i to %i"
+                % (offset, handle.tell(), index_offset, index_offset + index_length)
+            )
+
+    if offset % 8:
+        padding = 8 - (offset % 8)
+        extra = handle.read(padding)
+
+    if padding >= 4 and extra[-4:] == _sff:
+        # Seen this in one user supplied file, should have been
+        # four bytes of null padding but was actually .sff and
+        # the start of a new concatenated SFF file!
+        raise ValueError(
+            "Your SFF file is invalid, post index %i byte "
+            "null padding region ended '.sff' which could "
+            "be the start of a concatenated SFF file? "
+            "See offset %i" % (padding, offset)
+        )
+    if padding and not extra:
+        # TODO - Is this error harmless enough to just ignore?
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Your SFF file is technically invalid as it is missing "
+            "a terminal %i byte null padding region." % padding,
+            BiopythonParserWarning,
+        )
+        return
+    if extra.count(_null) != padding:
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Your SFF file is invalid, post index %i byte "
+            "null padding region contained data: %r" % (padding, extra),
+            BiopythonParserWarning,
+        )
+
+    offset = handle.tell()
+    if offset % 8 != 0:
+        raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8))
+    # Should now be at the end of the file...
+    extra = handle.read(4)
+    if extra == _sff:
+        raise ValueError(
+            "Additional data at end of SFF file, "
+            "perhaps multiple SFF files concatenated? "
+            "See offset %i" % offset
+        )
+    elif extra:
+        raise ValueError("Additional data at end of SFF file, see offset %i" % offset)
+
+
+class _SffTrimIterator(SffIterator):
+    """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
+
+    def __init__(self, source):
+        super().__init__(source, trim=True)
+
+
+class SffWriter(SequenceWriter):
+    """SFF file writer."""
+
+    def __init__(self, target, index=True, xml=None):
+        """Initialize an SFF writer object.
+
+        Arguments:
+         - target - Output stream opened in binary mode, or a path to a file.
+         - index - Boolean argument, should we try and write an index?
+         - xml - Optional string argument, xml manifest to be recorded
+           in the index block (see function ReadRocheXmlManifest for
+           reading this data).
+
+        """
+        super().__init__(target, "wb")
+        self._xml = xml
+        if index:
+            self._index = []
+        else:
+            self._index = None
+
+    def write_file(self, records):
+        """Use this to write an entire file containing the given records."""
+        try:
+            self._number_of_reads = len(records)
+        except TypeError:
+            self._number_of_reads = 0  # dummy value
+            if not hasattr(self.handle, "seek") or not hasattr(self.handle, "tell"):
+                raise ValueError(
+                    "A handle with a seek/tell methods is required in order "
+                    "to record the total record count in the file header "
+                    "(once it is known at the end)."
+                ) from None
+        if self._index is not None and not (
+            hasattr(self.handle, "seek") and hasattr(self.handle, "tell")
+        ):
+            import warnings
+
+            warnings.warn(
+                "A handle with a seek/tell methods is required in "
+                "order to record an SFF index."
+            )
+            self._index = None
+        self._index_start = 0
+        self._index_length = 0
+        if not hasattr(records, "next"):
+            records = iter(records)
+        # Get the first record in order to find the flow information
+        # we will need for the header.
+        try:
+            record = next(records)
+        except StopIteration:
+            record = None
+        if record is None:
+            # No records -> empty SFF file (or an error)?
+            # We can't write a header without the flow information.
+            # return 0
+            raise ValueError("Must have at least one sequence")
+        try:
+            self._key_sequence = record.annotations["flow_key"].encode("ASCII")
+            self._flow_chars = record.annotations["flow_chars"].encode("ASCII")
+            self._number_of_flows_per_read = len(self._flow_chars)
+        except KeyError:
+            raise ValueError("Missing SFF flow information") from None
+        self.write_header()
+        self.write_record(record)
+        count = 1
+        for record in records:
+            self.write_record(record)
+            count += 1
+        if self._number_of_reads == 0:
+            # Must go back and record the record count...
+            offset = self.handle.tell()
+            self.handle.seek(0)
+            self._number_of_reads = count
+            self.write_header()
+            self.handle.seek(offset)  # not essential?
+        else:
+            assert count == self._number_of_reads
+        if self._index is not None:
+            self._write_index()
+        return count
+
+    def _write_index(self):
+        assert len(self._index) == self._number_of_reads
+        handle = self.handle
+        self._index.sort()
+        self._index_start = handle.tell()  # need for header
+        # XML...
+        if self._xml is not None:
+            xml = self._xml.encode()
+        else:
+            from Bio import __version__
+
+            xml = "\n" % __version__
+            xml += (
+                "\n"
+            )
+            xml += "\n"
+            xml = xml.encode()
+        xml_len = len(xml)
+        # Write to the file...
+        fmt = ">I4BLL"
+        fmt_size = struct.calcsize(fmt)
+        handle.write(_null * fmt_size + xml)  # fill this later
+        fmt2 = ">6B"
+        assert 6 == struct.calcsize(fmt2)
+        self._index.sort()
+        index_len = 0  # don't know yet!
+        for name, offset in self._index:
+            # Roche files record the offsets using base 255 not 256.
+            # See comments for parsing the index block. There may be a faster
+            # way to code this, but we can't easily use shifts due to odd base
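+            # (e.g. offset 300 is stored as the digit bytes
+            # (off3, off2, off1, off0) = (0, 0, 1, 45), since 1*255 + 45 = 300,
+            # followed by the 0xFF flag byte)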
+            off3 = offset
+            off0 = off3 % 255
+            off3 -= off0
+            off1 = off3 % 65025
+            off3 -= off1
+            off2 = off3 % 16581375
+            off3 -= off2
+            if offset != off0 + off1 + off2 + off3:
+                raise RuntimeError(
+                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
+                )
+            off3, off2, off1, off0 = (
+                off3 // 16581375,
+                off2 // 65025,
+                off1 // 255,
+                off0,
+            )
+            if not (off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255):
+                raise RuntimeError(
+                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
+                )
+            handle.write(name + struct.pack(fmt2, 0, off3, off2, off1, off0, 255))
+            index_len += len(name) + 6
+        # Note any padding is not included:
+        self._index_length = fmt_size + xml_len + index_len  # need for header
+        # Pad out to an 8 byte boundary (although I have noticed some
+        # real Roche SFF files neglect to do this despite their manual
+        # suggesting this padding should be there):
+        if self._index_length % 8:
+            padding = 8 - (self._index_length % 8)
+            handle.write(_null * padding)
+        else:
+            padding = 0
+        offset = handle.tell()
+        if offset != self._index_start + self._index_length + padding:
+            raise RuntimeError(
+                "%i vs %i + %i + %i"
+                % (offset, self._index_start, self._index_length, padding)
+            )
+        # Must now go back and update the index header with index size...
+        handle.seek(self._index_start)
+        handle.write(
+            struct.pack(
+                fmt,
+                778921588,  # magic number
+                49,
+                46,
+                48,
+                48,  # Roche index version, "1.00"
+                xml_len,
+                index_len,
+            )
+            + xml
+        )
+        # Must now go back and update the header...
+        handle.seek(0)
+        self.write_header()
+        handle.seek(offset)  # not essential?
+
+    def write_header(self):
+        """Write the SFF file header."""
+        # Do header...
+        key_length = len(self._key_sequence)
+        # file header (part one)
+        # use big endian encoding     >
+        # magic_number               I
+        # version                    4B
+        # index_offset               Q
+        # index_length               I
+        # number_of_reads            I
+        # header_length              H
+        # key_length                 H
+        # number_of_flows_per_read   H
+        # flowgram_format_code       B
+        # [rest of file header depends on the number of flows and how many keys]
+        fmt = ">I4BQIIHHHB%is%is" % (self._number_of_flows_per_read, key_length)
+        # According to the spec, the header_length field should be the total
+        # number of bytes required by this set of header fields, and should be
+        # equal to "31 + number_of_flows_per_read + key_length" rounded up to
+        # the next value divisible by 8.
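+        # (For example, 400 flows and a 4 base key give 31 + 400 + 4 = 435,
+        # which is padded up to 440 bytes.)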
+        if struct.calcsize(fmt) % 8 == 0:
+            padding = 0
+        else:
+            padding = 8 - (struct.calcsize(fmt) % 8)
+        header_length = struct.calcsize(fmt) + padding
+        assert header_length % 8 == 0
+        header = struct.pack(
+            fmt,
+            779314790,  # magic number 0x2E736666
+            0,
+            0,
+            0,
+            1,  # version
+            self._index_start,
+            self._index_length,
+            self._number_of_reads,
+            header_length,
+            key_length,
+            self._number_of_flows_per_read,
+            1,  # the only flowgram format code we support
+            self._flow_chars,
+            self._key_sequence,
+        )
+        self.handle.write(header + _null * padding)
+
+    def write_record(self, record):
+        """Write a single additional record to the output file.
+
+        This assumes the file header has already been written.
+        """
+        # Basics
+        name = record.id.encode()
+        name_len = len(name)
+        seq = bytes(record.seq).upper()
+        seq_len = len(seq)
+        # Qualities
+        try:
+            quals = record.letter_annotations["phred_quality"]
+        except KeyError:
+            raise ValueError(
+                "Missing PHRED qualities information for %s" % record.id
+            ) from None
+        # Flow
+        try:
+            flow_values = record.annotations["flow_values"]
+            flow_index = record.annotations["flow_index"]
+            if (
+                self._key_sequence != record.annotations["flow_key"].encode()
+                or self._flow_chars != record.annotations["flow_chars"].encode()
+            ):
+                raise ValueError("Records have inconsistent SFF flow data")
+        except KeyError:
+            raise ValueError(
+                "Missing SFF flow information for %s" % record.id
+            ) from None
+        except AttributeError:
+            raise ValueError("Header not written yet?") from None
+        # Clipping
+        try:
+            clip_qual_left = record.annotations["clip_qual_left"]
+            if clip_qual_left < 0:
+                raise ValueError("Negative SFF clip_qual_left value for %s" % record.id)
+            if clip_qual_left:
+                clip_qual_left += 1
+            clip_qual_right = record.annotations["clip_qual_right"]
+            if clip_qual_right < 0:
+                raise ValueError(
+                    "Negative SFF clip_qual_right value for %s" % record.id
+                )
+            clip_adapter_left = record.annotations["clip_adapter_left"]
+            if clip_adapter_left < 0:
+                raise ValueError(
+                    "Negative SFF clip_adapter_left value for %s" % record.id
+                )
+            if clip_adapter_left:
+                clip_adapter_left += 1
+            clip_adapter_right = record.annotations["clip_adapter_right"]
+            if clip_adapter_right < 0:
+                raise ValueError(
+                    "Negative SFF clip_adapter_right value for %s" % record.id
+                )
+        except KeyError:
+            raise ValueError(
+                "Missing SFF clipping information for %s" % record.id
+            ) from None
+
+        # Capture information for index
+        if self._index is not None:
+            offset = self.handle.tell()
+            # Check the position of the final record (before sort by name)
+            # Using a four-digit base 255 number, so the upper bound is
+            # 254*(1)+254*(255)+254*(255**2)+254*(255**3) = 4228250624
+            # or equivalently it overflows at 255**4 = 4228250625
+            if offset > 4228250624:
+                import warnings
+
+                warnings.warn(
+                    "Read %s has file offset %i, which is too large "
+                    "to store in the Roche SFF index structure. No "
+                    "index block will be recorded." % (name, offset)
+                )
+                # No point recording the offsets now
+                self._index = None
+            else:
+                self._index.append((name, self.handle.tell()))
+
+        # the read header format (fixed part):
+        # read_header_length     H
+        # name_length            H
+        # seq_len                I
+        # clip_qual_left         H
+        # clip_qual_right        H
+        # clip_adapter_left      H
+        # clip_adapter_right     H
+        # [rest of read header depends on the name length etc]
+        # name
+        # flow values
+        # flow index
+        # sequence
+        # padding
+        read_header_fmt = ">2HI4H%is" % name_len
+        if struct.calcsize(read_header_fmt) % 8 == 0:
+            padding = 0
+        else:
+            padding = 8 - (struct.calcsize(read_header_fmt) % 8)
+        read_header_length = struct.calcsize(read_header_fmt) + padding
+        assert read_header_length % 8 == 0
+        data = (
+            struct.pack(
+                read_header_fmt,
+                read_header_length,
+                name_len,
+                seq_len,
+                clip_qual_left,
+                clip_qual_right,
+                clip_adapter_left,
+                clip_adapter_right,
+                name,
+            )
+            + _null * padding
+        )
+        assert len(data) == read_header_length
+        # now the flowgram values, flowgram index, bases and qualities
+        # NOTE - assuming flowgram_format==1, which means struct type H
+        read_flow_fmt = ">%iH" % self._number_of_flows_per_read
+        read_flow_size = struct.calcsize(read_flow_fmt)
+        temp_fmt = ">%iB" % seq_len  # used for flow index and quals
+        data += (
+            struct.pack(read_flow_fmt, *flow_values)
+            + struct.pack(temp_fmt, *flow_index)
+            + seq
+            + struct.pack(temp_fmt, *quals)
+        )
+        # now any final padding...
+        padding = (read_flow_size + seq_len * 3) % 8
+        if padding:
+            padding = 8 - padding
+        self.handle.write(data + _null * padding)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/SnapGeneIO.py b/code/lib/Bio/SeqIO/SnapGeneIO.py
new file mode 100644
index 0000000..5c670ab
--- /dev/null
+++ b/code/lib/Bio/SeqIO/SnapGeneIO.py
@@ -0,0 +1,296 @@
+# Copyright 2017-2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the SnapGene file format.
+
+The SnapGene binary format is the native format used by the SnapGene program
+from GSL Biotech LLC.
+"""
+from datetime import datetime
+from re import sub
+from struct import unpack
+from xml.dom.minidom import parseString
+
+from Bio.Seq import Seq
+from Bio.SeqFeature import FeatureLocation
+from Bio.SeqFeature import SeqFeature
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+
+
+def _iterate(handle):
+    """Iterate over the packets of a SnapGene file.
+
+    A SnapGene file is made of packets, each packet being a TLV-like
+    structure comprising:
+
+      - 1 single byte indicating the packet's type;
+      - 1 big-endian long integer (4 bytes) indicating the length of the
+        packet's data;
+      - the actual data.
+    """
+    while True:
+        packet_type = handle.read(1)
+        if len(packet_type) < 1:  # No more packets
+            return
+        packet_type = unpack(">B", packet_type)[0]
+
+        length = handle.read(4)
+        if len(length) < 4:
+            raise ValueError("Unexpected end of packet")
+        length = unpack(">I", length)[0]
+
+        data = handle.read(length)
+        if len(data) < length:
+            raise ValueError("Unexpected end of packet")
+
+        yield (packet_type, length, data)
+
+
+def _parse_dna_packet(length, data, record):
+    """Parse a DNA sequence packet.
+
+    A DNA sequence packet contains a single byte flag followed by the
+    sequence itself.
+    """
+    if record.seq:
+        raise ValueError("The file contains more than one DNA packet")
+
+    flags, sequence = unpack(">B%ds" % (length - 1), data)
+    record.seq = Seq(sequence.decode("ASCII"))
+    record.annotations["molecule_type"] = "DNA"
+    if flags & 0x01:
+        record.annotations["topology"] = "circular"
+    else:
+        record.annotations["topology"] = "linear"
+
+
+def _parse_notes_packet(length, data, record):
+    """Parse a 'Notes' packet.
+
+    This type of packet contains some metadata about the sequence, stored
+    as an XML string with a 'Notes' root node.
+    """
+    xml = parseString(data.decode("UTF-8"))
+    type = _get_child_value(xml, "Type")
+    if type == "Synthetic":
+        record.annotations["data_file_division"] = "SYN"
+    else:
+        record.annotations["data_file_division"] = "UNC"
+
+    date = _get_child_value(xml, "LastModified")
+    if date:
+        record.annotations["date"] = datetime.strptime(date, "%Y.%m.%d")
+
+    acc = _get_child_value(xml, "AccessionNumber")
+    if acc:
+        record.id = acc
+
+    comment = _get_child_value(xml, "Comments")
+    if comment:
+        record.name = comment.split(" ", 1)[0]
+        record.description = comment
+        if not acc:
+            record.id = record.name
+
+
+def _parse_cookie_packet(length, data, record):
+    """Parse a SnapGene cookie packet.
+
+    Every SnapGene file starts with a packet of this type. It acts as
+    a magic cookie identifying the file as a SnapGene file.
+    """
+    cookie, seq_type, exp_version, imp_version = unpack(">8sHHH", data)
+    if cookie.decode("ASCII") != "SnapGene":
+        raise ValueError("The file is not a valid SnapGene file")
+
+
+def _parse_location(rangespec, strand, record):
+    start, end = [int(x) for x in rangespec.split("-")]
+    # Account for SnapGene's 1-based coordinates
+    start = start - 1
+    if start > end:
+        # Range wrapping the end of the sequence
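+        # e.g. "950-100" on a 1000 bp circular record becomes the join
+        # of [949:1000] and [0:100]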
+        l1 = FeatureLocation(start, len(record), strand=strand)
+        l2 = FeatureLocation(0, end, strand=strand)
+        location = l1 + l2
+    else:
+        location = FeatureLocation(start, end, strand=strand)
+    return location
+
+
+def _parse_features_packet(length, data, record):
+    """Parse a sequence features packet.
+
+    This packet stores sequence features (except primer binding sites,
+    which are in a dedicated Primers packet). The data is an XML string
+    starting with a 'Features' root node.
+    """
+    xml = parseString(data.decode("UTF-8"))
+    for feature in xml.getElementsByTagName("Feature"):
+        quals = {}
+
+        type = _get_attribute_value(feature, "type", default="misc_feature")
+
+        strand = +1
+        directionality = int(
+            _get_attribute_value(feature, "directionality", default="1")
+        )
+        if directionality == 2:
+            strand = -1
+
+        location = None
+        for segment in feature.getElementsByTagName("Segment"):
+            rng = _get_attribute_value(segment, "range")
+            if not location:
+                location = _parse_location(rng, strand, record)
+            else:
+                location = location + _parse_location(rng, strand, record)
+        if not location:
+            raise ValueError("Missing feature location")
+
+        for qualifier in feature.getElementsByTagName("Q"):
+            qname = _get_attribute_value(
+                qualifier, "name", error="Missing qualifier name"
+            )
+            qvalues = []
+            for value in qualifier.getElementsByTagName("V"):
+                if value.hasAttribute("text"):
+                    qvalues.append(_decode(value.attributes["text"].value))
+                elif value.hasAttribute("predef"):
+                    qvalues.append(_decode(value.attributes["predef"].value))
+                elif value.hasAttribute("int"):
+                    qvalues.append(int(value.attributes["int"].value))
+            quals[qname] = qvalues
+
+        name = _get_attribute_value(feature, "name")
+        if name:
+            if "label" not in quals:
+                # No explicit label attribute, use the SnapGene name
+                quals["label"] = [name]
+            elif name not in quals["label"]:
+                # The SnapGene name is different from the label,
+                # add a specific attribute to represent it
+                quals["name"] = [name]
+
+        feature = SeqFeature(location, type=type, qualifiers=quals)
+        record.features.append(feature)
+
+
+def _parse_primers_packet(length, data, record):
+    """Parse a Primers packet.
+
+    A Primers packet is similar to a Features packet but specifically
+    stores primer binding features. The data is an XML string starting
+    with a 'Primers' root node.
+    """
+    xml = parseString(data.decode("UTF-8"))
+    for primer in xml.getElementsByTagName("Primer"):
+        quals = {}
+
+        name = _get_attribute_value(primer, "name")
+        if name:
+            quals["label"] = [name]
+
+        for site in primer.getElementsByTagName("BindingSite"):
+            rng = _get_attribute_value(
+                site, "location", error="Missing binding site location"
+            )
+            strand = int(_get_attribute_value(site, "boundStrand", default="0"))
+            if strand == 1:
+                strand = -1
+            else:
+                strand = +1
+
+            feature = SeqFeature(
+                _parse_location(rng, strand, record),
+                type="primer_bind",
+                qualifiers=quals,
+            )
+            record.features.append(feature)
+
+
+_packet_handlers = {
+    0x00: _parse_dna_packet,
+    0x05: _parse_primers_packet,
+    0x06: _parse_notes_packet,
+    0x0A: _parse_features_packet,
+}
+
+
+# Helper functions to process the XML data in
+# some of the segments
+
+
+def _decode(text):
+    # Get rid of HTML tags in some values
+    return sub("<[^>]+>", "", text)
+
+
+def _get_attribute_value(node, name, default=None, error=None):
+    if node.hasAttribute(name):
+        return _decode(node.attributes[name].value)
+    elif error:
+        raise ValueError(error)
+    else:
+        return default
+
+
+def _get_child_value(node, name, default=None, error=None):
+    children = node.getElementsByTagName(name)
+    if (
+        children
+        and children[0].childNodes
+        and children[0].firstChild.nodeType == node.TEXT_NODE
+    ):
+        return _decode(children[0].firstChild.data)
+    elif error:
+        raise ValueError(error)
+    else:
+        return default
+
+
+class SnapGeneIterator(SequenceIterator):
+    """Parser for SnapGene files."""
+
+    def __init__(self, source):
+        """Parse a SnapGene file and return a SeqRecord object.
+
+        Argument source is a file-like object or a path to a file.
+
+        Note that a SnapGene file can only contain one sequence, so this
+        iterator will always return a single record.
+        """
+        super().__init__(source, mode="b", fmt="SnapGene")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Iterate over the records in the SnapGene file."""
+        record = SeqRecord(None)
+        packets = _iterate(handle)
+        try:
+            packet_type, length, data = next(packets)
+        except StopIteration:
+            raise ValueError("Empty file.") from None
+
+        if packet_type != 0x09:
+            raise ValueError("The file does not start with a SnapGene cookie packet")
+        _parse_cookie_packet(length, data, record)
+
+        for (packet_type, length, data) in packets:
+            handler = _packet_handlers.get(packet_type)
+            if handler is not None:
+                handler(length, data, record)
+
+        if not record.seq:
+            raise ValueError("No DNA packet in file")
+
+        yield record
diff --git a/code/lib/Bio/SeqIO/SwissIO.py b/code/lib/Bio/SeqIO/SwissIO.py
new file mode 100644
index 0000000..8362451
--- /dev/null
+++ b/code/lib/Bio/SeqIO/SwissIO.py
@@ -0,0 +1,142 @@
+# Copyright 2006-2013,2020 by Peter Cock.
+# Revisions copyright 2008-2009 by Michiel de Hoon.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.SwissProt module which offers more than just accessing
+the sequences as SeqRecord objects.
+
+See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
+"""
+from Bio import SeqFeature
+from Bio import SwissProt
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+def _make_position(location_string, offset=0):
+    """Turn a Swiss location position into a SeqFeature position object (PRIVATE).
+
+    An offset of -1 is used with a start location to make it pythonic.
+    """
+    if location_string == "?":
+        return SeqFeature.UnknownPosition()
+    # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
+    try:
+        return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
+    except ValueError:
+        pass
+    if location_string.startswith("<"):
+        try:
+            return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:])))
+        except ValueError:
+            pass
+    elif location_string.startswith(">"):  # e.g. ">13"
+        try:
+            return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:])))
+        except ValueError:
+            pass
+    elif location_string.startswith("?"):  # e.g. "?22"
+        try:
+            return SeqFeature.UncertainPosition(
+                max(0, offset + int(location_string[1:]))
+            )
+        except ValueError:
+            pass
+    raise NotImplementedError("Cannot parse location '%s'" % location_string)
+
+
+def SwissIterator(source):
+    """Break up a Swiss-Prot/UniProt file into SeqRecord objects.
+
+    Argument source is a file-like object or a path to a file.
+
+    Every section from the ID line to the terminating // becomes
+    a single SeqRecord with associated annotation and features.
+
+    This parser is for the flat file "swiss" format as used by:
+     - Swiss-Prot aka SwissProt
+     - TrEMBL
+     - UniProtKB aka UniProt Knowledgebase
+
+    For consistency with BioPerl and EMBOSS we call this the "swiss"
+    format. See also the SeqIO support for "uniprot-xml" format.
+
+    Rather than calling it directly, you are expected to use this
+    parser via Bio.SeqIO.parse(..., format="swiss") instead.
+    """
+    swiss_records = SwissProt.parse(source)
+
+    for swiss_record in swiss_records:
+        # Convert the SwissProt record to a SeqRecord
+        record = SeqRecord(
+            Seq(swiss_record.sequence),
+            id=swiss_record.accessions[0],
+            name=swiss_record.entry_name,
+            description=swiss_record.description,
+            features=swiss_record.features,
+        )
+        for cross_reference in swiss_record.cross_references:
+            if len(cross_reference) < 2:
+                continue
+            database, accession = cross_reference[:2]
+            dbxref = "%s:%s" % (database, accession)
+            if dbxref not in record.dbxrefs:
+                record.dbxrefs.append(dbxref)
+        annotations = record.annotations
+        annotations["molecule_type"] = "protein"
+        annotations["accessions"] = swiss_record.accessions
+        if swiss_record.protein_existence:
+            annotations["protein_existence"] = swiss_record.protein_existence
+        if swiss_record.created:
+            date, version = swiss_record.created
+            annotations["date"] = date
+            annotations["sequence_version"] = version
+        if swiss_record.sequence_update:
+            date, version = swiss_record.sequence_update
+            annotations["date_last_sequence_update"] = date
+            annotations["sequence_version"] = version
+        if swiss_record.annotation_update:
+            date, version = swiss_record.annotation_update
+            annotations["date_last_annotation_update"] = date
+            annotations["entry_version"] = version
+        if swiss_record.gene_name:
+            annotations["gene_name"] = swiss_record.gene_name
+        annotations["organism"] = swiss_record.organism.rstrip(".")
+        annotations["taxonomy"] = swiss_record.organism_classification
+        annotations["ncbi_taxid"] = swiss_record.taxonomy_id
+        if swiss_record.host_organism:
+            annotations["organism_host"] = swiss_record.host_organism
+        if swiss_record.host_taxonomy_id:
+            annotations["host_ncbi_taxid"] = swiss_record.host_taxonomy_id
+        if swiss_record.comments:
+            annotations["comment"] = "\n".join(swiss_record.comments)
+        if swiss_record.references:
+            annotations["references"] = []
+            for reference in swiss_record.references:
+                feature = SeqFeature.Reference()
+                feature.comment = " ".join("%s=%s;" % k_v for k_v in reference.comments)
+                for key, value in reference.references:
+                    if key == "PubMed":
+                        feature.pubmed_id = value
+                    elif key == "MEDLINE":
+                        feature.medline_id = value
+                    elif key == "DOI":
+                        pass
+                    elif key == "AGRICOLA":
+                        pass
+                    else:
+                        raise ValueError("Unknown key %s found in references" % key)
+                feature.authors = reference.authors
+                feature.title = reference.title
+                feature.journal = reference.location
+                annotations["references"].append(feature)
+        if swiss_record.keywords:
+            record.annotations["keywords"] = swiss_record.keywords
+        yield record
diff --git a/code/lib/Bio/SeqIO/TabIO.py b/code/lib/Bio/SeqIO/TabIO.py
new file mode 100644
index 0000000..2770d90
--- /dev/null
+++ b/code/lib/Bio/SeqIO/TabIO.py
@@ -0,0 +1,139 @@
+# Copyright 2008-2017,2020 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "tab" (simple tab separated) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+
+The "tab" format is an ad-hoc plain text file format where each sequence is
+on one (long) line.  Each line contains the identifier/description, followed
+by a tab, followed by the sequence.  For example, consider the following
+short FASTA format file::
+
+    >ID123456 possible binding site?
+    CATCNAGATGACACTACGACTACGACTCAGACTAC
+    >ID123457 random sequence
+    ACACTACGACTACGACTCAGACTACAAN
+
+Apart from the descriptions, this can be represented in the simple two column
+tab separated format as follows::
+
+    ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
+    ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
+
+When reading this file, "ID123456" or "ID123457" will be taken as the record's
+.id and .name properties.  There is no other information to record.
+
+Similarly, when writing to this format, Biopython will ONLY record the record's
+.id and .seq (and not the description or any other information) as in the
+example above.
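+
+For example, a minimal reading sketch via Bio.SeqIO (here "example.tab" is a
+hypothetical file in the two-column layout shown above)::
+
+    from Bio import SeqIO
+
+    for record in SeqIO.parse("example.tab", "tab"):
+        print("%s length %i" % (record.id, len(record)))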
+"""
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import _clean
+from .Interfaces import _get_seq_string
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+class TabIterator(SequenceIterator):
+    """Parser for tab-delimited files."""
+
+    def __init__(self, source):
+        """Iterate over tab separated lines as SeqRecord objects.
+
+        Each line of the file should contain one tab only, dividing the line
+        into an identifier and the full sequence.
+
+        Arguments:
+         - source - file-like object opened in text mode, or a path to a file
+
+        The first field is taken as the record's .id and .name (regardless of
+        any spaces within the text) and the second field is the sequence.
+
+        Any blank lines are ignored.
+
+        Examples
+        --------
+        >>> with open("GenBank/NC_005816.tsv") as handle:
+        ...     for record in TabIterator(handle):
+        ...         print("%s length %i" % (record.id, len(record)))
+        gi|45478712|ref|NP_995567.1| length 340
+        gi|45478713|ref|NP_995568.1| length 260
+        gi|45478714|ref|NP_995569.1| length 64
+        gi|45478715|ref|NP_995570.1| length 123
+        gi|45478716|ref|NP_995571.1| length 145
+        gi|45478717|ref|NP_995572.1| length 357
+        gi|45478718|ref|NP_995573.1| length 138
+        gi|45478719|ref|NP_995574.1| length 312
+        gi|45478720|ref|NP_995575.1| length 99
+        gi|45478721|ref|NP_995576.1| length 90
+
+        """
+        super().__init__(source, mode="t", fmt="Tab-separated plain-text")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for line in handle:
+            try:
+                title, seq = line.split("\t")  # will fail if more than one tab!
+            except ValueError:
+                if line.strip() == "":
+                    # It's a blank line, ignore it
+                    continue
+                raise ValueError(
+                    "Each line should have one tab separating the"
+                    + " title and sequence, this line has %i tabs: %r"
+                    % (line.count("\t"), line)
+                ) from None
+            title = title.strip()
+            seq = seq.strip()  # removes the trailing new line
+            yield SeqRecord(Seq(seq), id=title, name=title, description="")
+
+
+class TabWriter(SequenceWriter):
+    """Class to write simple tab separated format files.
+
+    Each line consists of "id(tab)sequence" only.
+
+    Any description, name or other annotation is not recorded.
+
+    This class is not intended to be used directly. Instead, please use
+    the function ``as_tab``, or the top level ``Bio.SeqIO.write()`` function
+    with ``format="tab"``.
+    """
+
+    def write_record(self, record):
+        """Write a single tab line to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+        self.handle.write(as_tab(record))
+
+
+def as_tab(record):
+    """Return record as tab separated (id(tab)seq) string."""
+    title = _clean(record.id)
+    seq = _get_seq_string(record)  # Catches sequence being None
+    assert "\t" not in title
+    assert "\n" not in title
+    assert "\r" not in title
+    assert "\t" not in seq
+    assert "\n" not in seq
+    assert "\r" not in seq
+    return "%s\t%s\n" % (title, seq)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SeqIO/TwoBitIO.py b/code/lib/Bio/SeqIO/TwoBitIO.py
new file mode 100644
index 0000000..4ad0775
--- /dev/null
+++ b/code/lib/Bio/SeqIO/TwoBitIO.py
@@ -0,0 +1,250 @@
+# Copyright 2020 by Michiel de Hoon
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.
+
+This parser reads the index stored in the twoBit file, as well as the masked
+regions and the N's for each sequence. It also creates sequence data objects
+(_TwoBitSequenceData objects), which support only two methods: __len__ and
+__getitem__. The former will return the length of the sequence, while the
+latter returns the sequence (as a bytes object) for the requested region.
+
+Using the information in the index, the __getitem__ method calculates the file
+position at which the requested region starts, and only reads the requested
+sequence region. Note that the full sequence of a record is loaded only if
+specifically requested, making the parser memory-efficient.
+
+The TwoBitIterator object implements the __getitem__, keys, and __len__
+methods that allow it to be used as a dictionary.
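+
+A usage sketch (here "genome.2bit" and "chr1" are hypothetical)::
+
+    from Bio.SeqIO.TwoBitIO import TwoBitIterator
+
+    with open("genome.2bit", "rb") as handle:
+        records = TwoBitIterator(handle)
+        record = records["chr1"]        # dictionary-style access by name
+        print(record.seq[1000:1010])    # only this region is read from disk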
+"""
+# The .2bit file format is defined by UCSC as follows
+# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
+#
+#
+# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
+# randomly-accessible format. The file contains masking information as well
+# as the DNA itself.
+#
+# The file begins with a 16-byte header containing the following fields:
+#
+# signature - the number 0x1A412743 in the architecture of the machine that
+#             created the file
+# version - zero for now. Readers should abort if they see a version number
+#           higher than 0
+# sequenceCount - the number of sequences in the file
+# reserved - always zero for now
+#
+# All fields are 32 bits unless noted. If the signature value is not as
+# given, the reader program should byte-swap the signature and check if the
+# swapped version matches. If so, all multiple-byte entities in the file
+# will have to be byte-swapped. This enables these binary files to be used
+# unchanged on different architectures.
+#
+# The header is followed by a file index, which contains one entry for each
+# sequence. Each index entry contains three fields:
+#
+# nameSize - a byte containing the length of the name field
+# name - the sequence name itself (in ASCII-compatible byte string), of
+#        variable length depending on nameSize
+# offset - the 32-bit offset of the sequence data relative to the start of
+#          the file, not aligned to any 4-byte padding boundary
+#
+# The index is followed by the sequence records, which contain nine fields:
+#
+# dnaSize - number of bases of DNA in the sequence
+# nBlockCount - the number of blocks of Ns in the file (representing unknown
+#               sequence)
+# nBlockStarts - an array of length nBlockCount of 32 bit integers
+#                indicating the (0-based) starting position of a block of Ns
+# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
+#               the length of a block of Ns
+# maskBlockCount - the number of masked (lower-case) blocks
+# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
+#                   indicating the (0-based) starting position of a masked block
+# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
+#                  indicating the length of a masked block
+# reserved - always zero for now
+# packedDna - the DNA packed to two bits per base, represented as so:
+#             T - 00, C - 01, A - 10, G - 11. The first base is in the most
+#             significant 2-bit byte; the last base is in the least significant
+#             2 bits. For example, the sequence TCAG is represented as 00011011.
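+#
+# A minimal decoding sketch (illustrative only; the actual conversion is done
+# by the compiled _twoBitIO helper imported below):
+#
+#     TWOBIT_BASES = "TCAG"
+#     def unpack_byte(b):
+#         # Most significant pair of bits first.
+#         return "".join(TWOBIT_BASES[(b >> shift) & 3] for shift in (6, 4, 2, 0))
+#
+#     assert unpack_byte(0b00011011) == "TCAG"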
+import numpy
+
+from Bio.Seq import Seq
+from Bio.Seq import SequenceDataAbstractBaseClass
+from Bio.SeqRecord import SeqRecord
+
+from . import _twoBitIO
+from .Interfaces import SequenceIterator
+
+
+class _TwoBitSequenceData(SequenceDataAbstractBaseClass):
+    """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).
+
+    Objects of this class store the file position at which the sequence data
+    start, the sequence length, and the start and end position of unknown (N)
+    and masked (lowercase) letters in the sequence.
+
+    Only two methods are provided: __len__ and __getitem__. The former will
+    return the length of the sequence, while the latter returns the sequence
+    (as a bytes object) for the requested region. The full sequence of a record
+    is loaded only if explicitly requested.
+    """
+
+    __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks")
+
+    def __init__(self, stream, offset, length):
+        """Initialize the file stream and file position of the sequence data."""
+        self.stream = stream
+        self.offset = offset
+        self.length = length
+        super().__init__()
+
+    def __getitem__(self, key):
+        length = self.length
+        if isinstance(key, slice):
+            start, end, step = key.indices(length)
+            size = len(range(start, end, step))
+            if size == 0:
+                return b""
+        else:
+            if key < 0:
+                key += length
+                if key < 0:
+                    raise IndexError("index out of range")
+            start = key
+            end = key + 1
+            step = 1
+            size = 1
+        byteStart = start // 4
+        byteEnd = (end + 3) // 4
+        byteSize = byteEnd - byteStart
+        stream = self.stream
+        try:
+            stream.seek(self.offset + byteStart)
+        except ValueError as exception:
+            if str(exception) == "seek of closed file":
+                raise ValueError("cannot retrieve sequence: file is closed") from None
+            raise
+        data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
+        sequence = _twoBitIO.convert(
+            data, start, end, step, self.nBlocks, self.maskBlocks
+        )
+        if isinstance(key, slice):
+            return sequence
+        else:  # single nucleotide
+            return ord(sequence)
+
+    def __len__(self):
+        return self.length
+
+    def upper(self):
+        """Remove the sequence mask."""
+        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
+        data.nBlocks = self.nBlocks[:, :]
+        data.maskBlocks = numpy.empty((0, 2), dtype="uint32")
+        return data
+
+    def lower(self):
+        """Extend the sequence mask to the full sequence."""
+        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
+        data.nBlocks = self.nBlocks[:, :]
+        data.maskBlocks = numpy.array([[0, self.length]], dtype="uint32")
+        return data
+
+
+class TwoBitIterator(SequenceIterator):
+    """Parser for UCSC twoBit (.2bit) files."""
+
+    def __init__(self, source):
+        """Read the file index."""
+        super().__init__(source, mode="b", fmt="twoBit")
+        # wait to close the file until the TwoBitIterator goes out of scope:
+        self.should_close_stream = False
+        stream = self.stream
+        data = stream.read(4)
+        if not data:
+            raise ValueError("Empty file.")
+        byteorders = ("little", "big")
+        dtypes = ("<u4", ">u4")  # little- and big-endian unsigned 32-bit
+        for byteorder, dtype in zip(byteorders, dtypes):
+            signature = int.from_bytes(data, byteorder)
+            if signature == 0x1A412743:
+                break
+        else:
+            raise ValueError("Unknown signature")
+        self.byteorder = byteorder
+        data = stream.read(4)
+        version = int.from_bytes(data, byteorder, signed=False)
+        if version == 1:
+            raise ValueError(
+                "version-1 twoBit files with 64-bit offsets for index are currently not supported"
+            )
+        if version != 0:
+            raise ValueError("Found unexpected file version %u; aborting" % version)
+        data = stream.read(4)
+        sequenceCount = int.from_bytes(data, byteorder, signed=False)
+        data = stream.read(4)
+        reserved = int.from_bytes(data, byteorder, signed=False)
+        if reserved != 0:
+            raise ValueError("Found non-zero reserved field; aborting")
+        sequences = {}
+        for i in range(sequenceCount):
+            data = stream.read(1)
+            nameSize = int.from_bytes(data, byteorder, signed=False)
+            data = stream.read(nameSize)
+            name = data.decode("ASCII")
+            data = stream.read(4)
+            offset = int.from_bytes(data, byteorder, signed=False)
+            sequences[name] = (stream, offset)
+        self.sequences = sequences
+        for name, (stream, offset) in sequences.items():
+            stream.seek(offset)
+            data = stream.read(4)
+            dnaSize = int.from_bytes(data, byteorder, signed=False)
+            sequence = _TwoBitSequenceData(stream, offset, dnaSize)
+            data = stream.read(4)
+            nBlockCount = int.from_bytes(data, byteorder, signed=False)
+            nBlockStarts = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
+            nBlockSizes = numpy.fromfile(stream, dtype=dtype, count=nBlockCount)
+            sequence.nBlocks = numpy.empty((nBlockCount, 2), dtype="uint32")
+            sequence.nBlocks[:, 0] = nBlockStarts
+            sequence.nBlocks[:, 1] = nBlockStarts + nBlockSizes
+            data = stream.read(4)
+            maskBlockCount = int.from_bytes(data, byteorder, signed=False)
+            maskBlockStarts = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
+            maskBlockSizes = numpy.fromfile(stream, dtype=dtype, count=maskBlockCount)
+            sequence.maskBlocks = numpy.empty((maskBlockCount, 2), dtype="uint32")
+            sequence.maskBlocks[:, 0] = maskBlockStarts
+            sequence.maskBlocks[:, 1] = maskBlockStarts + maskBlockSizes
+            data = stream.read(4)
+            reserved = int.from_bytes(data, byteorder, signed=False)
+            if reserved != 0:
+                raise ValueError("Found non-zero reserved field %u" % reserved)
+            sequence.offset = stream.tell()
+            sequences[name] = sequence
+
+    def parse(self, stream):
+        """Iterate over the sequences in the file."""
+        for name, sequence in self.sequences.items():
+            sequence = Seq(sequence)
+            record = SeqRecord(sequence, id=name)
+            yield record
+
+    def __getitem__(self, name):
+        try:
+            sequence = self.sequences[name]
+        except KeyError:
+            raise KeyError(name) from None
+        sequence = Seq(sequence)
+        return SeqRecord(sequence, id=name)
+
+    def keys(self):
+        """Return a list with the names of the sequences in the file."""
+        return self.sequences.keys()
+
+    def __len__(self):
+        return len(self.sequences)
diff --git a/code/lib/Bio/SeqIO/UniprotIO.py b/code/lib/Bio/SeqIO/UniprotIO.py
new file mode 100644
index 0000000..50b881d
--- /dev/null
+++ b/code/lib/Bio/SeqIO/UniprotIO.py
@@ -0,0 +1,561 @@
+# Copyright 2010 by Andrea Pierleoni
+# Revisions copyright 2010, 2016 by Peter Cock
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "uniprot-xml" file format.
+
+See Also:
+http://www.uniprot.org
+
+The UniProt XML format essentially replaces the old plain text file format
+originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
+
+"""
+from xml.etree import ElementTree
+from xml.parsers.expat import errors
+
+from Bio import SeqFeature
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+NS = "{http://uniprot.org/uniprot}"
+REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
+
+
+def UniprotIterator(source, alphabet=None, return_raw_comments=False):
+    """Iterate over UniProt XML as SeqRecord objects.
+
+    Parses one XML entry at a time from any UniProt XML file and returns
+    a SeqRecord for each iteration.
+
+    This generator can be used in Bio.SeqIO
+
+    Argument source is a file-like object or a path to a file.
+
+    Optional argument alphabet should not be used anymore.
+
+    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
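+
+    A usage sketch via the top-level API (here "uniprot.xml" is a
+    hypothetical UniProt XML file)::
+
+        from Bio import SeqIO
+
+        for record in SeqIO.parse("uniprot.xml", "uniprot-xml"):
+            print(record.id, record.description)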
+    """
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+    try:
+        for event, elem in ElementTree.iterparse(source, events=("start", "end")):
+            if event == "end" and elem.tag == NS + "entry":
+                yield Parser(elem, return_raw_comments=return_raw_comments).parse()
+                elem.clear()
+    except ElementTree.ParseError as exception:
+        if errors.messages[exception.code] == errors.XML_ERROR_NO_ELEMENTS:
+            assert exception.position == (1, 0)  # line 1, column 0
+            raise ValueError("Empty file.") from None
+        else:
+            raise
+
+
+class Parser:
+    """Parse a UniProt XML entry to a SeqRecord.
+
+    Optional argument alphabet is no longer used.
+
+    return_raw_comments=True to get back the complete comment field in XML format
+    """
+
+    def __init__(self, elem, alphabet=None, return_raw_comments=False):
+        """Initialize the class."""
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        self.entry = elem
+        self.return_raw_comments = return_raw_comments
+
+    def parse(self):
+        """Parse the input."""
+        assert self.entry.tag == NS + "entry"
+
+        def append_to_annotations(key, value):
+            if key not in self.ParsedSeqRecord.annotations:
+                self.ParsedSeqRecord.annotations[key] = []
+            if value not in self.ParsedSeqRecord.annotations[key]:
+                self.ParsedSeqRecord.annotations[key].append(value)
+
+        def _parse_name(element):
+            self.ParsedSeqRecord.name = element.text
+            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)
+
+        def _parse_accession(element):
+            append_to_annotations(
+                "accessions", element.text
+            )  # to cope with SwissProt plain text parser
+            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)
+
+        def _parse_protein(element):
+            """Parse protein names (PRIVATE)."""
+            descr_set = False
+            for protein_element in element:
+                if protein_element.tag in [
+                    NS + "recommendedName",
+                    NS + "submittedName",
+                    NS + "alternativeName",
+                ]:  # recommendedName tags are parsed first
+                    # use protein fields for name and description
+                    for rec_name in protein_element:
+                        ann_key = "%s_%s" % (
+                            protein_element.tag.replace(NS, ""),
+                            rec_name.tag.replace(NS, ""),
+                        )
+                        append_to_annotations(ann_key, rec_name.text)
+                        if (rec_name.tag == NS + "fullName") and not descr_set:
+                            self.ParsedSeqRecord.description = rec_name.text
+                            descr_set = True
+                elif protein_element.tag == NS + "component":
+                    pass  # not parsed
+                elif protein_element.tag == NS + "domain":
+                    pass  # not parsed
+
+        def _parse_gene(element):
+            for genename_element in element:
+                if "type" in genename_element.attrib:
+                    ann_key = "gene_%s_%s" % (
+                        genename_element.tag.replace(NS, ""),
+                        genename_element.attrib["type"],
+                    )
+                    if genename_element.attrib["type"] == "primary":
+                        self.ParsedSeqRecord.annotations[
+                            ann_key
+                        ] = genename_element.text
+                    else:
+                        append_to_annotations(ann_key, genename_element.text)
+
+        def _parse_geneLocation(element):
+            append_to_annotations("geneLocation", element.attrib["type"])
+
+        def _parse_organism(element):
+            organism_name = com_name = sci_name = ""
+            for organism_element in element:
+                if organism_element.tag == NS + "name":
+                    if organism_element.text:
+                        if organism_element.attrib["type"] == "scientific":
+                            sci_name = organism_element.text
+                        elif organism_element.attrib["type"] == "common":
+                            com_name = organism_element.text
+                        else:
+                            # e.g. synonym
+                            append_to_annotations(
+                                "organism_name", organism_element.text
+                            )
+                elif organism_element.tag == NS + "dbReference":
+                    self.ParsedSeqRecord.dbxrefs.append(
+                        organism_element.attrib["type"]
+                        + ":"
+                        + organism_element.attrib["id"]
+                    )
+                elif organism_element.tag == NS + "lineage":
+                    for taxon_element in organism_element:
+                        if taxon_element.tag == NS + "taxon":
+                            append_to_annotations("taxonomy", taxon_element.text)
+            if sci_name and com_name:
+                organism_name = "%s (%s)" % (sci_name, com_name)
+            elif sci_name:
+                organism_name = sci_name
+            elif com_name:
+                organism_name = com_name
+            self.ParsedSeqRecord.annotations["organism"] = organism_name
+
+        def _parse_organismHost(element):
+            for organism_element in element:
+                if organism_element.tag == NS + "name":
+                    append_to_annotations("organism_host", organism_element.text)
+
+        def _parse_keyword(element):
+            append_to_annotations("keywords", element.text)
+
+        def _parse_comment(element):
+            """Parse comments (PRIVATE).
+
+            Comment fields are very heterogeneous; each type has its own (frequently changing) schema.
+            To store all the contained data, more complex data structures are needed, such as
+            annotated dictionaries. This is left to the end user, who can optionally set:
+
+            return_raw_comments=True
+
+            The original XML is returned in the annotation fields.
+
+            Available comment types as of December 2009:
+             - "allergen"
+             - "alternative products"
+             - "biotechnology"
+             - "biophysicochemical properties"
+             - "catalytic activity"
+             - "caution"
+             - "cofactor"
+             - "developmental stage"
+             - "disease"
+             - "domain"
+             - "disruption phenotype"
+             - "enzyme regulation"
+             - "function"
+             - "induction"
+             - "miscellaneous"
+             - "pathway"
+             - "pharmaceutical"
+             - "polymorphism"
+             - "PTM"
+             - "RNA editing"
+             - "similarity"
+             - "subcellular location"
+             - "sequence caution"
+             - "subunit"
+             - "tissue specificity"
+             - "toxic dose"
+             - "online information"
+             - "mass spectrometry"
+             - "interaction"
+
+            """
+            simple_comments = [
+                "allergen",
+                "biotechnology",
+                "biophysicochemical properties",
+                "catalytic activity",
+                "caution",
+                "cofactor",
+                "developmental stage",
+                "disease",
+                "domain",
+                "disruption phenotype",
+                "enzyme regulation",
+                "function",
+                "induction",
+                "miscellaneous",
+                "pathway",
+                "pharmaceutical",
+                "polymorphism",
+                "PTM",
+                "RNA editing",  # positions not parsed
+                "similarity",
+                "subunit",
+                "tissue specificity",
+                "toxic dose",
+            ]
+
+            if element.attrib["type"] in simple_comments:
+                ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
+                for text_element in element.iter(NS + "text"):
+                    if text_element.text:
+                        append_to_annotations(ann_key, text_element.text)
+            elif element.attrib["type"] == "subcellular location":
+                for subloc_element in element.iter(NS + "subcellularLocation"):
+                    for el in subloc_element:
+                        if el.text:
+                            ann_key = "comment_%s_%s" % (
+                                element.attrib["type"].replace(" ", ""),
+                                el.tag.replace(NS, ""),
+                            )
+                            append_to_annotations(ann_key, el.text)
+            elif element.attrib["type"] == "interaction":
+                for interact_element in element.iter(NS + "interactant"):
+                    ann_key = "comment_%s_intactId" % element.attrib["type"]
+                    append_to_annotations(ann_key, interact_element.attrib["intactId"])
+            elif element.attrib["type"] == "alternative products":
+                for alt_element in element.iter(NS + "isoform"):
+                    ann_key = "comment_%s_isoform" % element.attrib["type"].replace(
+                        " ", ""
+                    )
+                    for id_element in alt_element.iter(NS + "id"):
+                        append_to_annotations(ann_key, id_element.text)
+            elif element.attrib["type"] == "mass spectrometry":
+                ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
+                start = end = 0
+                for el in element.iter(NS + "location"):
+                    pos_els = list(el.iter(NS + "position"))
+                    # This try/except should ideally be avoided; it may be safer
+                    # to skip position parsing for mass spectrometry entirely.
+                    try:
+                        if pos_els:
+                            end = int(pos_els[0].attrib["position"])
+                            start = end - 1
+                        else:
+                            start = int(next(el.iter(NS + "begin")).attrib["position"])
+                            start -= 1
+                            end = int(next(el.iter(NS + "end")).attrib["position"])
+                    except (ValueError, KeyError):
+                        # undefined positions or erroneously mapped
+                        pass
+                mass = element.attrib["mass"]
+                method = element.attrib["method"]
+                if start == end == 0:
+                    append_to_annotations(ann_key, "undefined:%s|%s" % (mass, method))
+                else:
+                    append_to_annotations(
+                        ann_key, "%s..%s:%s|%s" % (start, end, mass, method)
+                    )
+            elif element.attrib["type"] == "sequence caution":
+                pass  # not parsed: few information, complex structure
+            elif element.attrib["type"] == "online information":
+                for link_element in element.iter(NS + "link"):
+                    ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
+                    for id_element in link_element.iter(NS + "link"):
+                        append_to_annotations(
+                            ann_key,
+                            "%s@%s"
+                            % (element.attrib["name"], link_element.attrib["uri"]),
+                        )
+
+            # return raw XML comments if needed
+            if self.return_raw_comments:
+                ann_key = "comment_%s_xml" % element.attrib["type"].replace(" ", "")
+                append_to_annotations(ann_key, ElementTree.tostring(element))
+
+        def _parse_dbReference(element):
+            self.ParsedSeqRecord.dbxrefs.append(
+                element.attrib["type"] + ":" + element.attrib["id"]
+            )
+            # e.g.
+            # <dbReference type="PDB" key="11" id="2GEZ">
+            #   <property value="X-ray" type="method"/>
+            #   <property value="2.60 A" type="resolution"/>
+            #   <property value="A/B/C/D=1-200" type="chains"/>
+            # </dbReference>
+            if "type" in element.attrib:
+                if element.attrib["type"] == "PDB":
+                    method = ""
+                    resolution = ""
+                    for ref_element in element:
+                        if ref_element.tag == NS + "property":
+                            dat_type = ref_element.attrib["type"]
+                            if dat_type == "method":
+                                method = ref_element.attrib["value"]
+                            if dat_type == "resolution":
+                                resolution = ref_element.attrib["value"]
+                            if dat_type == "chains":
+                                pairs = ref_element.attrib["value"].split(",")
+                                for elem in pairs:
+                                    pair = elem.strip().split("=")
+                                    if pair[1] != "-":
+                                        # TODO - How best to store these, do SeqFeatures make sense?
+                                        feature = SeqFeature.SeqFeature()
+                                        feature.type = element.attrib["type"]
+                                        feature.qualifiers["name"] = element.attrib[
+                                            "id"
+                                        ]
+                                        feature.qualifiers["method"] = method
+                                        feature.qualifiers["resolution"] = resolution
+                                        feature.qualifiers["chains"] = pair[0].split(
+                                            "/"
+                                        )
+                                        start = int(pair[1].split("-")[0]) - 1
+                                        end = int(pair[1].split("-")[1])
+                                        feature.location = SeqFeature.FeatureLocation(
+                                            start, end
+                                        )
+                                        # self.ParsedSeqRecord.features.append(feature)
+
+            for ref_element in element:
+                if ref_element.tag == NS + "property":
+                    pass  # This data cannot be fitted into a SeqRecord object as a simple list; however, at least Ensembl and EMBL parsing could be improved to add entries to dbxrefs.
+
+        def _parse_reference(element):
+            reference = SeqFeature.Reference()
+            authors = []
+            scopes = []
+            tissues = []
+            journal_name = ""
+            pub_type = ""
+            pub_date = ""
+            for ref_element in element:
+                if ref_element.tag == NS + "citation":
+                    pub_type = ref_element.attrib["type"]
+                    if pub_type == "submission":
+                        pub_type += " to the " + ref_element.attrib["db"]
+                    if "name" in ref_element.attrib:
+                        journal_name = ref_element.attrib["name"]
+                    pub_date = ref_element.attrib.get("date", "")
+                    j_volume = ref_element.attrib.get("volume", "")
+                    j_first = ref_element.attrib.get("first", "")
+                    j_last = ref_element.attrib.get("last", "")
+                    for cit_element in ref_element:
+                        if cit_element.tag == NS + "title":
+                            reference.title = cit_element.text
+                        elif cit_element.tag == NS + "authorList":
+                            for person_element in cit_element:
+                                authors.append(person_element.attrib["name"])
+                        elif cit_element.tag == NS + "dbReference":
+                            self.ParsedSeqRecord.dbxrefs.append(
+                                cit_element.attrib["type"]
+                                + ":"
+                                + cit_element.attrib["id"]
+                            )
+                            if cit_element.attrib["type"] == "PubMed":
+                                reference.pubmed_id = cit_element.attrib["id"]
+                            elif ref_element.attrib["type"] == "MEDLINE":
+                                reference.medline_id = cit_element.attrib["id"]
+                elif ref_element.tag == NS + "scope":
+                    scopes.append(ref_element.text)
+                elif ref_element.tag == NS + "source":
+                    for source_element in ref_element:
+                        if source_element.tag == NS + "tissue":
+                            tissues.append(source_element.text)
+            if scopes:
+                scopes_str = "Scope: " + ", ".join(scopes)
+            else:
+                scopes_str = ""
+            if tissues:
+                tissues_str = "Tissue: " + ", ".join(tissues)
+            else:
+                tissues_str = ""
+
+            # locations cannot be parsed since they are actually written in
+            # free text inside scopes so all the references are put in the
+            # annotation.
+            reference.location = []
+            reference.authors = ", ".join(authors)
+            if journal_name:
+                if pub_date and j_volume and j_first and j_last:
+                    reference.journal = REFERENCE_JOURNAL % {
+                        "name": journal_name,
+                        "volume": j_volume,
+                        "first": j_first,
+                        "last": j_last,
+                        "pub_date": pub_date,
+                    }
+                else:
+                    reference.journal = journal_name
+            reference.comment = " | ".join(
+                (pub_type, pub_date, scopes_str, tissues_str)
+            )
+            append_to_annotations("references", reference)
+
+        def _parse_position(element, offset=0):
+            try:
+                position = int(element.attrib["position"]) + offset
+            except KeyError:
+                position = None
+            status = element.attrib.get("status", "")
+            if status == "unknown":
+                assert position is None
+                return SeqFeature.UnknownPosition()
+            elif not status:
+                return SeqFeature.ExactPosition(position)
+            elif status == "greater than":
+                return SeqFeature.AfterPosition(position)
+            elif status == "less than":
+                return SeqFeature.BeforePosition(position)
+            elif status == "uncertain":
+                return SeqFeature.UncertainPosition(position)
+            else:
+                raise NotImplementedError("Position status %r" % status)
+
+        def _parse_feature(element):
+            feature = SeqFeature.SeqFeature()
+            for k, v in element.attrib.items():
+                feature.qualifiers[k] = v
+            feature.type = element.attrib.get("type", "")
+            if "id" in element.attrib:
+                feature.id = element.attrib["id"]
+            for feature_element in element:
+                if feature_element.tag == NS + "location":
+                    position_elements = feature_element.findall(NS + "position")
+                    if position_elements:
+                        element = position_elements[0]
+                        start_position = _parse_position(element, -1)
+                        end_position = _parse_position(element)
+                    else:
+                        element = feature_element.findall(NS + "begin")[0]
+                        start_position = _parse_position(element, -1)
+                        element = feature_element.findall(NS + "end")[0]
+                        end_position = _parse_position(element)
+                    feature.location = SeqFeature.FeatureLocation(
+                        start_position, end_position
+                    )
+                else:
+                    try:
+                        feature.qualifiers[
+                            feature_element.tag.replace(NS, "")
+                        ] = feature_element.text
+                    except Exception:  # TODO - Which exceptions?
+                        pass  # skip unparsable tag
+            self.ParsedSeqRecord.features.append(feature)
+
+        def _parse_proteinExistence(element):
+            append_to_annotations("proteinExistence", element.attrib["type"])
+
+        def _parse_evidence(element):
+            for k, v in element.attrib.items():
+                ann_key = k
+                append_to_annotations(ann_key, v)
+
+        def _parse_sequence(element):
+            for k, v in element.attrib.items():
+                if k in ("length", "mass", "version"):
+                    self.ParsedSeqRecord.annotations["sequence_%s" % k] = int(v)
+                else:
+                    self.ParsedSeqRecord.annotations["sequence_%s" % k] = v
+            self.ParsedSeqRecord.seq = Seq("".join(element.text.split()))
+            self.ParsedSeqRecord.annotations["molecule_type"] = "protein"
+
+        # ============================================#
+        # Initialize SeqRecord
+        self.ParsedSeqRecord = SeqRecord("", id="")
+
+        # Entry attribs parsing
+        # Unknown dataset should not happen!
+        self.dbname = self.entry.attrib.get("dataset", "UnknownDataset")
+        # add attribs to annotations
+        for k, v in self.entry.attrib.items():
+            if k == "version":
+                # original
+                # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v)
+                # To cope with the SwissProt plain text parser. This can cause
+                # errors if the attrib has the same name as another annotation.
+                self.ParsedSeqRecord.annotations[k] = int(v)
+            else:
+                # self.ParsedSeqRecord.annotations["entry_%s" % k] = v
+                # to cope with swissProt plain text parser:
+                self.ParsedSeqRecord.annotations[k] = v
+
+        # Top-to-bottom entry children parsing
+        for element in self.entry:
+            if element.tag == NS + "name":
+                _parse_name(element)
+            elif element.tag == NS + "accession":
+                _parse_accession(element)
+            elif element.tag == NS + "protein":
+                _parse_protein(element)
+            elif element.tag == NS + "gene":
+                _parse_gene(element)
+            elif element.tag == NS + "geneLocation":
+                _parse_geneLocation(element)
+            elif element.tag == NS + "organism":
+                _parse_organism(element)
+            elif element.tag == NS + "organismHost":
+                _parse_organismHost(element)
+            elif element.tag == NS + "keyword":
+                _parse_keyword(element)
+            elif element.tag == NS + "comment":
+                _parse_comment(element)
+            elif element.tag == NS + "dbReference":
+                _parse_dbReference(element)
+            elif element.tag == NS + "reference":
+                _parse_reference(element)
+            elif element.tag == NS + "feature":
+                _parse_feature(element)
+            elif element.tag == NS + "proteinExistence":
+                _parse_proteinExistence(element)
+            elif element.tag == NS + "evidence":
+                _parse_evidence(element)
+            elif element.tag == NS + "sequence":
+                _parse_sequence(element)
+            else:
+                pass
+
+        # remove duplicate dbxrefs
+        self.ParsedSeqRecord.dbxrefs = sorted(set(self.ParsedSeqRecord.dbxrefs))
+
+        # use first accession as id
+        if not self.ParsedSeqRecord.id:
+            self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations["accessions"][0]
+
+        return self.ParsedSeqRecord
diff --git a/code/lib/Bio/SeqIO/XdnaIO.py b/code/lib/Bio/SeqIO/XdnaIO.py
new file mode 100644
index 0000000..74ade5e
--- /dev/null
+++ b/code/lib/Bio/SeqIO/XdnaIO.py
@@ -0,0 +1,366 @@
+# Copyright 2017-2019 Damien Goutte-Gattat.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.SeqIO support for the "xdna" file format.
+
+The Xdna binary format is generated by Christian Marck's DNA Strider program
+and also used by Serial Cloner.
+"""
+import warnings
+
+from re import match
+from struct import pack
+from struct import unpack
+
+from Bio import BiopythonWarning
+from Bio.Seq import Seq
+from Bio.SeqFeature import ExactPosition
+from Bio.SeqFeature import FeatureLocation
+from Bio.SeqFeature import SeqFeature
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequenceIterator
+from .Interfaces import SequenceWriter
+
+
+_seq_types = {
+    0: None,
+    1: "DNA",
+    2: "DNA",
+    3: "RNA",
+    4: "protein",
+}
+
+_seq_topologies = {0: "linear", 1: "circular"}
+
+
+def _read(handle, length):
+    """Read the specified number of bytes from the given handle."""
+    data = handle.read(length)
+    if len(data) < length:
+        raise ValueError("Cannot read %d bytes from handle" % length)
+    return data
+
+
+def _read_pstring(handle):
+    """Read a Pascal string.
+
+    A Pascal string comprises a single byte giving the length of the string
+    followed by as many bytes.
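+
+    For example, the bytes b"\x02AB" encode the two-character string "AB".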
+    """
+    length = unpack(">B", _read(handle, 1))[0]
+    return unpack("%ds" % length, _read(handle, length))[0].decode("ASCII")
+
+
+def _read_pstring_as_integer(handle):
+    return int(_read_pstring(handle))
+
+
+def _read_overhang(handle):
+    """Read an overhang specification.
+
+    An overhang is represented in an XDNA file as:
+      - a Pascal string containing the text representation of the overhang
+        length, which also indicates the nature of the overhang:
+        - a length of zero means no overhang,
+        - a negative length means a 3' overhang,
+        - a positive length means a 5' overhang;
+      - the actual overhang sequence.
+
+    Examples:
+      - 0x01 0x30: no overhang ("0", as a P-string)
+      - 0x01 0x32 0x41 0x41: 5' AA overhang (P-string "2", then "AA")
+      - 0x02 0x2D 0x31 0x43: 3' C overhang (P-string "-1", then "C")
+
+    Returns a tuple (length, sequence).
+
+    """
+    length = _read_pstring_as_integer(handle)
+    if length != 0:
+        overhang = _read(handle, abs(length))
+        return (length, overhang)
+    else:
+        return (None, None)
+
+
+def _parse_feature_description(desc, qualifiers):
+    """Parse the description field of a Xdna feature.
+
+    The 'description' field of a feature sometimes contains several
+    GenBank-like qualifiers, separated by carriage returns (CR, 0x0D).
+    """
+    # Split the field's value in CR-separated lines, skipping empty lines
+    for line in [x for x in desc.split("\x0D") if len(x) > 0]:
+        # Is it a qualifier="value" line?
+        m = match('^([^=]+)="([^"]+)"?$', line)
+        if m:
+            # Store the qualifier as provided
+            qual, value = m.groups()
+            qualifiers[qual] = [value]
+        elif '"' not in line:  # Reject ill-formed qualifiers
+            # Store the entire line as a generic note qualifier
+            qualifiers["note"] = [line]
+
+
+def _read_feature(handle, record):
+    """Read a single sequence feature."""
+    name = _read_pstring(handle)
+    desc = _read_pstring(handle)
+    type = _read_pstring(handle) or "misc_feature"
+    start = _read_pstring_as_integer(handle)
+    end = _read_pstring_as_integer(handle)
+
+    # Feature flags (4 bytes):
+    # byte 1 is the strand (0: reverse strand, 1: forward strand);
+    # byte 2 tells whether to display the feature;
+    # byte 4 tells whether to draw an arrow when displaying the feature;
+    # meaning of byte 3 is unknown.
+    (forward, display, arrow) = unpack(">BBxB", _read(handle, 4))
+    if forward:
+        strand = 1
+    else:
+        strand = -1
+        start, end = end, start
+
+    # The last field is a Pascal string usually containing a
+    # comma-separated triplet of numbers ranging from 0 to 255.
+    # I suspect this represents the RGB color to use when displaying
+    # the feature. Skip it as we have no need for it.
+    _read_pstring(handle)
+
+    # Assemble the feature
+    # Shift start by -1 as XDNA feature coordinates are 1-based
+    # while Biopython uses 0-based counting.
+    location = FeatureLocation(start - 1, end, strand=strand)
+    qualifiers = {}
+    if name:
+        qualifiers["label"] = [name]
+    _parse_feature_description(desc, qualifiers)
+    feature = SeqFeature(location, type=type, qualifiers=qualifiers)
+    record.features.append(feature)
+
+
+class XdnaIterator(SequenceIterator):
+    """Parser for Xdna files."""
+
+    def __init__(self, source):
+        """Parse a Xdna file and return a SeqRecord object.
+
+        Argument source is a file-like object in binary mode or a path to a file.
+
+        Note that this is an "iterator" in name only since an Xdna file always
+        contains a single sequence.
+
+        """
+        super().__init__(source, mode="b", fmt="Xdna")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        # Parse fixed-size header and do some rudimentary checks
+        #
+        # The "neg_length" value is the length of the part of the sequence
+        # before the nucleotide considered as the "origin" (nucleotide number 1,
+        # which in DNA Strider is not always the first nucleotide).
+        # Biopython's SeqRecord has no such concept of a sequence origin as far
+        # as I know, so we ignore that value. SerialCloner has no such concept
+        # either and always generates files with a neg_length of zero.
+        header = handle.read(112)
+        if not header:
+            raise ValueError("Empty file.")
+        if len(header) < 112:
+            raise ValueError("Improper header, cannot read 112 bytes from handle")
+        records = self.iterate(handle, header)
+        return records
+
+    def iterate(self, handle, header):
+        """Parse the file and generate SeqRecord objects."""
+        (version, seq_type, topology, length, neg_length, com_length) = unpack(
+            ">BBB25xII60xI12x", header
+        )
+        if version != 0:
+            raise ValueError("Unsupported XDNA version")
+        if seq_type not in _seq_types:
+            raise ValueError("Unknown sequence type")
+        # Read actual sequence and comment found in all XDNA files
+        sequence = _read(handle, length).decode("ASCII")
+        comment = _read(handle, com_length).decode("ASCII")
+
+        # Try to derive a name from the first "word" of the comment
+        name = comment.split(" ")[0]
+
+        # Create record object
+        record = SeqRecord(Seq(sequence), description=comment, name=name, id=name)
+        if _seq_types[seq_type]:
+            record.annotations["molecule_type"] = _seq_types[seq_type]
+
+        if topology in _seq_topologies:
+            record.annotations["topology"] = _seq_topologies[topology]
+
+        if len(handle.read(1)) == 1:
+            # This is an XDNA file with an optional annotation section.
+
+            # Skip the overhangs as I don't know how to represent
+            # them in the SeqRecord model.
+            _read_overhang(handle)  # right-side overhang
+            _read_overhang(handle)  # left-side overhang
+
+            # Read the features
+            num_features = unpack(">B", _read(handle, 1))[0]
+            while num_features > 0:
+                _read_feature(handle, record)
+                num_features -= 1
+
+        yield record
+
+
+class XdnaWriter(SequenceWriter):
+    """Write files in the Xdna format."""
+
+    def __init__(self, target):
+        """Initialize an Xdna writer object.
+
+        Arguments:
+         - target - Output stream opened in binary mode, or a path to a file.
+
+        """
+        super().__init__(target, mode="wb")
+
+    def write_file(self, records):
+        """Write the specified record to a Xdna file.
+
+        Note that the function expects a list (or iterable) of records
+        as per the SequenceWriter interface, but the list should contain
+        only one record as the Xdna format is a mono-record format.
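+
+        A usage sketch via the top-level API (here record is an existing
+        SeqRecord and "out.xdna" a hypothetical output path)::
+
+            from Bio import SeqIO
+
+            SeqIO.write([record], "out.xdna", "xdna")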
+        """
+        records = iter(records)
+
+        try:
+            record = next(records)
+        except StopIteration:
+            raise ValueError("Must have one sequence") from None
+
+        try:
+            next(records)
+            raise ValueError("More than one sequence found")
+        except StopIteration:
+            pass
+
+        self._has_truncated_strings = False
+
+        molecule_type = record.annotations.get("molecule_type")
+        if molecule_type is None:
+            seqtype = 0
+        elif "DNA" in molecule_type:
+            seqtype = 1
+        elif "RNA" in molecule_type:
+            seqtype = 3
+        elif "protein" in molecule_type:
+            seqtype = 4
+        else:
+            seqtype = 0
+
+        if record.annotations.get("topology", "linear") == "circular":
+            topology = 1
+        else:
+            topology = 0
+
+        # We store the record's id and description in the comment field.
+        # Make sure to avoid duplicating the id if it is already
+        # contained in the description.
+        if record.description.startswith(record.id):
+            comment = record.description
+        else:
+            comment = f"{record.id} {record.description}"
+
+        # Write header
+        self.handle.write(
+            pack(
+                ">BBB25xII60xI11xB",
+                0,  # version
+                seqtype,
+                topology,
+                len(record),
+                0,  # negative length
+                len(comment),
+                255,  # end of header
+            )
+        )
+
+        # Actual sequence and comment
+        self.handle.write(bytes(record.seq))
+        self.handle.write(comment.encode("ASCII"))
+
+        self.handle.write(pack(">B", 0))  # Annotation section marker
+        self._write_pstring("0")  # right-side overhang
+        self._write_pstring("0")  # left-side overhand
+
+        # Write features
+        # We must skip features with fuzzy locations as they cannot be
+        # represented in the Xdna format
+        features = [
+            f
+            for f in record.features
+            if type(f.location.start) == ExactPosition
+            and type(f.location.end) == ExactPosition
+        ]
+        drop = len(record.features) - len(features)
+        if drop > 0:
+            warnings.warn(
+                f"Dropping {drop} features with fuzzy locations", BiopythonWarning
+            )
+
+        # We also cannot store more than 255 features as the number of
+        # features is stored on a single byte...
+        if len(features) > 255:
+            drop = len(features) - 255
+            warnings.warn(
+                f"Too many features, dropping the last {drop}", BiopythonWarning
+            )
+            features = features[:255]
+
+        self.handle.write(pack(">B", len(features)))
+        for feature in features:
+            self._write_pstring(feature.qualifiers.get("label", [""])[0])
+
+            description = ""
+            for qname in feature.qualifiers:
+                if qname in ("label", "translation"):
+                    continue
+
+                for val in feature.qualifiers[qname]:
+                    if len(description) > 0:
+                        description = description + "\x0D"
+                    description = description + '%s="%s"' % (qname, val)
+            self._write_pstring(description)
+
+            self._write_pstring(feature.type)
+
+            start = feature.location.start.position + 1  # 1-based coordinates
+            end = feature.location.end.position
+            strand = 1
+            if feature.location.strand == -1:
+                start, end = end, start
+                strand = 0
+            self._write_pstring(str(start))
+            self._write_pstring(str(end))
+
+            self.handle.write(pack(">BBBB", strand, 1, 0, 1))
+            self._write_pstring("127,127,127")
+
+        if self._has_truncated_strings:
+            warnings.warn(
+                "Some annotations were truncated to 255 characters", BiopythonWarning
+            )
+
+        return 1
+
+    def _write_pstring(self, s):
+        """Write the given string as a Pascal string."""
+        if len(s) > 255:
+            self._has_truncated_strings = True
+            s = s[:255]
+        self.handle.write(pack(">B", len(s)))
+        self.handle.write(s.encode("ASCII"))
diff --git a/code/lib/Bio/SeqIO/__init__.py b/code/lib/Bio/SeqIO/__init__.py
new file mode 100644
index 0000000..7872ffe
--- /dev/null
+++ b/code/lib/Bio/SeqIO/__init__.py
@@ -0,0 +1,1092 @@
+# Copyright 2006-2018 by Peter Cock.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+r"""Sequence input/output as SeqRecord objects.
+
+Bio.SeqIO is also documented at SeqIO_ and by a whole chapter in our tutorial:
+
+  - `HTML Tutorial`_
+  - `PDF Tutorial`_
+
+.. _SeqIO: http://biopython.org/wiki/SeqIO
+.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+
+Input
+-----
+The main function is Bio.SeqIO.parse(...) which takes an input file handle
+(or, in recent versions of Biopython, alternatively a filename as a string)
+and a format string.  This returns an iterator giving SeqRecord objects:
+
+>>> from Bio import SeqIO
+>>> for record in SeqIO.parse("Fasta/f002", "fasta"):
+...     print("%s %i" % (record.id, len(record)))
+gi|1348912|gb|G26680|G26680 633
+gi|1348917|gb|G26685|G26685 413
+gi|1592936|gb|G29385|G29385 471
+
+Note that the parse() function will invoke the relevant parser for the
+format with its default settings.  You may want more control, in which case
+you need to create a format specific sequence iterator directly.
+
+Some of these parsers are wrappers around low-level parsers which build up
+SeqRecord objects for the consistent SeqIO interface. In cases where the
+run-time is critical, such as large FASTA or FASTQ files, calling these
+underlying parsers directly will be much faster; these generator functions
+return tuples of strings rather than SeqRecord objects:
+
+>>> from Bio.SeqIO.FastaIO import SimpleFastaParser
+>>> from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
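+For example, a minimal sketch using SimpleFastaParser directly (the
+filename here is illustrative)::
+
+    with open("example.fasta") as handle:
+        for title, sequence in SimpleFastaParser(handle):
+            print(title, len(sequence))
+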
+
+Input - Single Records
+----------------------
+If you expect your file to contain one-and-only-one record, then we provide
+the following 'helper' function which will return a single SeqRecord, or
+raise an exception if there are no records or more than one record:
+
+>>> from Bio import SeqIO
+>>> record = SeqIO.read("Fasta/f001", "fasta")
+>>> print("%s %i" % (record.id, len(record)))
+gi|3318709|pdb|1A91| 79
+
+This style is useful when you expect a single record only (and would
+consider multiple records an error).  For example, when dealing with GenBank
+files for bacterial genomes or chromosomes, there is normally only a single
+record.  Alternatively, use this with a handle when downloading a single
+record from the internet.
+
+However, if you just want the first record from a file containing multiple
+records, use the next() function on the iterator:
+
+>>> from Bio import SeqIO
+>>> record = next(SeqIO.parse("Fasta/f002", "fasta"))
+>>> print("%s %i" % (record.id, len(record)))
+gi|1348912|gb|G26680|G26680 633
+
+The above code will work as long as the file contains at least one record.
+Note that if there is more than one record, the remaining records will be
+silently ignored.
+
+
+Input - Multiple Records
+------------------------
+For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
+using a sequence iterator can save you a lot of memory (RAM).  There is
+less benefit for interlaced file formats (e.g. most multiple alignment file
+formats).  However, an iterator only lets you access the records one by one.
+
+If you want random access to the records by number, turn this into a list:
+
+>>> from Bio import SeqIO
+>>> records = list(SeqIO.parse("Fasta/f002", "fasta"))
+>>> len(records)
+3
+>>> print(records[1].id)
+gi|1348917|gb|G26685|G26685
+
+If you want random access to the records by a key such as the record id,
+turn the iterator into a dictionary:
+
+>>> from Bio import SeqIO
+>>> record_dict = SeqIO.to_dict(SeqIO.parse("Fasta/f002", "fasta"))
+>>> len(record_dict)
+3
+>>> print(len(record_dict["gi|1348917|gb|G26685|G26685"]))
+413
+
+However, using list() or the to_dict() function will load all the records
+into memory at once, and is therefore not practical for very large files.
+Instead, for *some* file formats Bio.SeqIO provides an indexing approach
+providing dictionary like access to any record. For example,
+
+>>> from Bio import SeqIO
+>>> record_dict = SeqIO.index("Fasta/f002", "fasta")
+>>> len(record_dict)
+3
+>>> print(len(record_dict["gi|1348917|gb|G26685|G26685"]))
+413
+>>> record_dict.close()
+
+Many but not all of the supported input file formats can be indexed like
+this. For example "fasta", "fastq", "qual" and even the binary format "sff"
+work, but alignment formats like "phylip", "clustal" and "nexus" will not.
+
+In most cases you can also use SeqIO.index to get the record from the file
+as a raw string (not a SeqRecord). This can be useful for example to extract
+a sub-set of records from a file where SeqIO cannot output the file format
+(e.g. the plain text SwissProt format, "swiss") or where it is important to
+keep the output 100% identical to the input. For example,
+
+>>> from Bio import SeqIO
+>>> record_dict = SeqIO.index("Fasta/f002", "fasta")
+>>> len(record_dict)
+3
+>>> print(record_dict.get_raw("gi|1348917|gb|G26685|G26685").decode())
+>gi|1348917|gb|G26685|G26685 human STS STS_D11734.
+CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATCTTACTCTTTC
+ATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTTTACAGATGTGAAACTTTCAA
+GAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAAACCTGATGCTTTTATAAGCCATTGTGATTA
+GGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGT
+TCATATTACTNTAAGTTCTATAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGG
+AATTGNTTTGCCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
+
+>>> print(record_dict["gi|1348917|gb|G26685|G26685"].format("fasta"))
+>gi|1348917|gb|G26685|G26685 human STS STS_D11734.
+CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATC
+TTACTCTTTCATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTT
+TACAGATGTGAAACTTTCAAGAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAA
+ACCTGATGCTTTTATAAGCCATTGTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTG
+AAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGTTCATATTACTNTAAGTTCTA
+TAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTTTG
+CCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
+
+>>> record_dict.close()
+
+Here the original file and what Biopython would output differ in the line
+wrapping. Also note that the get_raw method will return a bytes object,
+hence the use of decode to turn it into a string.
+
+Also note that the get_raw method will preserve the newline endings. This
+example FASTQ file uses Unix style endings (b"\n" only),
+
+>>> from Bio import SeqIO
+>>> fastq_dict = SeqIO.index("Quality/example.fastq", "fastq")
+>>> len(fastq_dict)
+3
+>>> raw = fastq_dict.get_raw("EAS54_6_R1_2_1_540_792")
+>>> raw.count(b"\n")
+4
+>>> raw.count(b"\r\n")
+0
+>>> b"\r" in raw
+False
+>>> len(raw)
+78
+>>> fastq_dict.close()
+
+Here is the same file but using DOS/Windows new lines (b"\r\n" instead),
+
+>>> from Bio import SeqIO
+>>> fastq_dict = SeqIO.index("Quality/example_dos.fastq", "fastq")
+>>> len(fastq_dict)
+3
+>>> raw = fastq_dict.get_raw("EAS54_6_R1_2_1_540_792")
+>>> raw.count(b"\n")
+4
+>>> raw.count(b"\r\n")
+4
+>>> b"\r\n" in raw
+True
+>>> len(raw)
+82
+>>> fastq_dict.close()
+
+Because this uses two bytes for each new line, the file is longer than
+the Unix equivalent with only one byte.
+
+
+Input - Alignments
+------------------
+You can read in alignment files as alignment objects using Bio.AlignIO.
+Alternatively, reading in an alignment file format via Bio.SeqIO will give
+you a SeqRecord for each row of each alignment:
+
+>>> from Bio import SeqIO
+>>> for record in SeqIO.parse("Clustalw/hedgehog.aln", "clustal"):
+...     print("%s %i" % (record.id, len(record)))
+gi|167877390|gb|EDS40773.1| 447
+gi|167234445|ref|NP_001107837. 447
+gi|74100009|gb|AAZ99217.1| 447
+gi|13990994|dbj|BAA33523.2| 447
+gi|56122354|gb|AAV74328.1| 447
+
+
+Output
+------
+Use the function Bio.SeqIO.write(...), which takes a complete set of
+SeqRecord objects (either as a list, or an iterator), an output file handle
+(or in recent versions of Biopython an output filename as a string) and of
+course the file format::
+
+  from Bio import SeqIO
+  records = ...
+  SeqIO.write(records, "example.faa", "fasta")
+
+Or, using a handle::
+
+    from Bio import SeqIO
+    records = ...
+    with open("example.faa", "w") as handle:
+      SeqIO.write(records, handle, "fasta")
+
+You are expected to call this function once (with all your records) and if
+using a handle, make sure you close it to flush the data to the hard disk.
+
+
+Output - Advanced
+-----------------
+The effect of calling write() multiple times on a single file will vary
+depending on the file format, and is best avoided unless you have a strong
+reason to do so.
+
+If you give a filename, then each time you call write() the existing file
+will be overwritten. For sequential file formats (e.g. fasta, genbank) each
+"record block" holds a single sequence.  For these files it would probably
+be safe to call write() multiple times by re-using the same handle.
+
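+For example, a sketch re-using one handle for two FASTA batches (the
+filenames here are illustrative)::
+
+    from Bio import SeqIO
+    with open("combined.fasta", "w") as handle:
+        SeqIO.write(SeqIO.parse("part1.fasta", "fasta"), handle, "fasta")
+        SeqIO.write(SeqIO.parse("part2.fasta", "fasta"), handle, "fasta")
+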
+However, trying this for certain alignment formats (e.g. phylip, clustal,
+stockholm) would have the effect of concatenating several multiple sequence
+alignments together.  Such files are created by the PHYLIP suite of programs
+for bootstrap analysis, but it is clearer to do this via Bio.AlignIO instead.
+
+Worse, many file formats have an explicit header and/or footer structure
+(e.g. any XML format, and most binary file formats like SFF). Here making
+multiple calls to write() will result in an invalid file.
+
+
+Conversion
+----------
+The Bio.SeqIO.convert(...) function provides an easy interface for simple
+file format conversions. Additionally, it may use file format specific
+optimisations, so this should also be the fastest approach.
+
+In general however, you can combine the Bio.SeqIO.parse(...) function with
+the Bio.SeqIO.write(...) function for sequence file conversion. Using
+generator expressions or generator functions provides a memory efficient way
+to perform filtering or other extra operations as part of the process.
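+
+For example, a minimal sketch dropping short records during conversion
+(the filenames and length cutoff here are illustrative)::
+
+    from Bio import SeqIO
+    records = (rec for rec in SeqIO.parse("input.fastq", "fastq")
+               if len(rec) >= 50)
+    count = SeqIO.write(records, "filtered.fasta", "fasta")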
+
+
+File Formats
+------------
+When specifying the file format, use lowercase strings.  The same format
+names are also used in Bio.AlignIO and include the following:
+
+    - abi     - Applied Biosystem's sequencing trace format
+    - abi-trim - Same as "abi" but with quality trimming with Mott's algorithm
+    - ace     - Reads the contig sequences from an ACE assembly file.
+    - cif-atom - Uses Bio.PDB.MMCIFParser to determine the (partial) protein
+      sequence as it appears in the structure based on the atomic coordinates.
+    - cif-seqres - Reads a macromolecular Crystallographic Information File
+      (mmCIF) file to determine the complete protein sequence as defined by the
+      _pdbx_poly_seq_scheme records.
+    - embl    - The EMBL flat file format. Uses Bio.GenBank internally.
+    - fasta   - The generic sequence file format where each record starts with
+      an identifier line starting with a ">" character, followed by
+      lines of sequence.
+    - fasta-2line - Stricter interpretation of the FASTA format using exactly
+      two lines per record (no line wrapping).
+    - fastq   - A "FASTA like" format used by Sanger which also stores PHRED
+      sequence quality values (with an ASCII offset of 33).
+    - fastq-sanger - An alias for "fastq" for consistency with BioPerl and EMBOSS
+    - fastq-solexa - Original Solexa/Illumina variant of the FASTQ format which
+      encodes Solexa quality scores (not PHRED quality scores) with an
+      ASCII offset of 64.
+    - fastq-illumina - Solexa/Illumina 1.3 to 1.7 variant of the FASTQ format
+      which encodes PHRED quality scores with an ASCII offset of 64
+      (not 33). Note as of version 1.8 of the CASAVA pipeline Illumina
+      will produce FASTQ files using the standard Sanger encoding.
+    - gck     - Gene Construction Kit's format.
+    - genbank - The GenBank or GenPept flat file format.
+    - gb      - An alias for "genbank", for consistency with NCBI Entrez Utilities
+    - ig      - The IntelliGenetics file format, apparently the same as the
+      MASE alignment format.
+    - imgt    - An EMBL like format from IMGT where the feature tables are more
+      indented to allow for longer feature types.
+    - nib     - UCSC's nib file format for nucleotide sequences, which uses one
+      nibble (4 bits) to represent each nucleotide, and stores two nucleotides in
+      one byte.
+    - pdb-seqres -  Reads a Protein Data Bank (PDB) file to determine the
+      complete protein sequence as it appears in the header (no dependencies).
+    - pdb-atom - Uses Bio.PDB to determine the (partial) protein sequence as
+      it appears in the structure based on the atom coordinate section of the
+      file (requires NumPy for Bio.PDB).
+    - phd     - Output from PHRED, used by PHRAP and CONSED for input.
+    - pir     - A "FASTA like" format introduced by the National Biomedical
+      Research Foundation (NBRF) for the Protein Information Resource
+      (PIR) database, now part of UniProt.
+    - seqxml  - SeqXML, simple XML format described in Schmitt et al (2011).
+    - sff     - Standard Flowgram Format (SFF), typical output from Roche 454.
+    - sff-trim - Standard Flowgram Format (SFF) with given trimming applied.
+    - snapgene - SnapGene's native format.
+    - swiss   - Plain text Swiss-Prot aka UniProt format.
+    - tab     - Simple two column tab separated sequence files, where each
+      line holds a record's identifier and sequence. For example,
+      this is used by Agilent's eArray software when saving
+      microarray probes in a minimal tab delimited text file.
+    - qual    - A "FASTA like" format holding PHRED quality values from
+      sequencing DNA, but no actual sequences (usually provided
+      in separate FASTA files).
+    - uniprot-xml - The UniProt XML format (replacement for the SwissProt plain
+      text format which we call "swiss")
+    - xdna        - DNA Strider's and SerialCloner's native format.
+
+Note that while Bio.SeqIO can read all the above file formats, it cannot
+write to all of them.
+
+You can also use any file format supported by Bio.AlignIO, such as "nexus",
+"phylip" and "stockholm", which gives you access to the individual sequences
+making up each alignment as SeqRecords.
+"""
+# TODO
+# - define policy on reading aligned sequences with more than
+#   one gap character (see also AlignIO)
+#
+# - How best to handle unique/non unique record.id when writing.
+#   For most file formats reading such files is fine; The stockholm
+#   parser would fail.
+#
+# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
+#   http://www.bioperl.org/wiki/MSF_multiple_alignment_format
+#
+# FAO BioPython Developers
+# ------------------------
+# The way I envision this SeqIO system working is that for any sequence file
+# format we have an iterator that returns SeqRecord objects.
+#
+# This also applies to interlaced file formats (like clustal - although that
+# is now handled via Bio.AlignIO instead) where the file cannot be read record
+# by record.  You should still return an iterator, even if the implementation
+# could just as easily return a list.
+#
+# These file format specific sequence iterators may be implemented as:
+#    - Classes which take a handle for __init__ and provide the __iter__ method
+#    - Functions that take a handle, and return an iterator object
+#    - Generator functions that take a handle, and yield SeqRecord objects
+#
+# It is then trivial to turn this iterator into a list of SeqRecord objects,
+# an in memory dictionary, or a multiple sequence alignment object.
+#
+# For building the dictionary by default the id property of each SeqRecord is
+# used as the key.  You should always populate the id property, and it should
+# be unique in most cases. For some file formats the accession number is a good
+# choice.  If the file itself contains ambiguous identifiers, don't try to
+# disambiguate them - return them as is.
+#
+# When adding a new file format, please use the same lower case format name
+# as BioPerl, or if they have not defined one, try the names used by EMBOSS.
+#
+# See also http://biopython.org/wiki/SeqIO_dev
+#
+# --Peter
+from Bio.Align import MultipleSeqAlignment
+from Bio.File import as_handle
+from Bio.SeqIO import AbiIO
+from Bio.SeqIO import AceIO
+from Bio.SeqIO import FastaIO
+from Bio.SeqIO import GckIO
+from Bio.SeqIO import IgIO  # IntelliGenetics or MASE format
+from Bio.SeqIO import InsdcIO  # EMBL and GenBank
+from Bio.SeqIO import NibIO
+from Bio.SeqIO import PdbIO
+from Bio.SeqIO import PhdIO
+from Bio.SeqIO import PirIO
+from Bio.SeqIO import QualityIO  # FastQ and qual files
+from Bio.SeqIO import SeqXmlIO
+from Bio.SeqIO import SffIO
+from Bio.SeqIO import SnapGeneIO
+from Bio.SeqIO import SwissIO
+from Bio.SeqIO import TabIO
+from Bio.SeqIO import TwoBitIO
+from Bio.SeqIO import UniprotIO
+from Bio.SeqIO import XdnaIO
+from Bio.SeqRecord import SeqRecord
+
+# Convention for format names is "mainname-subtype" in lower case.
+# Please use the same names as BioPerl or EMBOSS where possible.
+#
+# Note that this simple system copes with defining
+# multiple possible iterators for a given format/extension
+# with the -subtype suffix
+#
+# Most alignment file formats will be handled via Bio.AlignIO
+
+_FormatToIterator = {
+    "abi": AbiIO.AbiIterator,
+    "abi-trim": AbiIO._AbiTrimIterator,
+    "ace": AceIO.AceIterator,
+    "fasta": FastaIO.FastaIterator,
+    "fasta-2line": FastaIO.FastaTwoLineIterator,
+    "ig": IgIO.IgIterator,
+    "embl": InsdcIO.EmblIterator,
+    "embl-cds": InsdcIO.EmblCdsFeatureIterator,
+    "gb": InsdcIO.GenBankIterator,
+    "gck": GckIO.GckIterator,
+    "genbank": InsdcIO.GenBankIterator,
+    "genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
+    "imgt": InsdcIO.ImgtIterator,
+    "nib": NibIO.NibIterator,
+    "cif-seqres": PdbIO.CifSeqresIterator,
+    "cif-atom": PdbIO.CifAtomIterator,
+    "pdb-atom": PdbIO.PdbAtomIterator,
+    "pdb-seqres": PdbIO.PdbSeqresIterator,
+    "phd": PhdIO.PhdIterator,
+    "pir": PirIO.PirIterator,
+    "fastq": QualityIO.FastqPhredIterator,
+    "fastq-sanger": QualityIO.FastqPhredIterator,
+    "fastq-solexa": QualityIO.FastqSolexaIterator,
+    "fastq-illumina": QualityIO.FastqIlluminaIterator,
+    "qual": QualityIO.QualPhredIterator,
+    "seqxml": SeqXmlIO.SeqXmlIterator,
+    "sff": SffIO.SffIterator,
+    "snapgene": SnapGeneIO.SnapGeneIterator,
+    "sff-trim": SffIO._SffTrimIterator,  # Not sure about this in the long run
+    "swiss": SwissIO.SwissIterator,
+    "tab": TabIO.TabIterator,
+    "twobit": TwoBitIO.TwoBitIterator,
+    "uniprot-xml": UniprotIO.UniprotIterator,
+    "xdna": XdnaIO.XdnaIterator,
+}
+
+_FormatToString = {
+    "fasta": FastaIO.as_fasta,
+    "fasta-2line": FastaIO.as_fasta_2line,
+    "tab": TabIO.as_tab,
+    "fastq": QualityIO.as_fastq,
+    "fastq-sanger": QualityIO.as_fastq,
+    "fastq-solexa": QualityIO.as_fastq_solexa,
+    "fastq-illumina": QualityIO.as_fastq_illumina,
+    "qual": QualityIO.as_qual,
+}
+
+# This could exclude file formats covered by _FormatToString?
+# Right now used in the unit tests as proxy for all supported outputs...
+_FormatToWriter = {
+    "fasta": FastaIO.FastaWriter,
+    "fasta-2line": FastaIO.FastaTwoLineWriter,
+    "gb": InsdcIO.GenBankWriter,
+    "genbank": InsdcIO.GenBankWriter,
+    "embl": InsdcIO.EmblWriter,
+    "imgt": InsdcIO.ImgtWriter,
+    "nib": NibIO.NibWriter,
+    "phd": PhdIO.PhdWriter,
+    "pir": PirIO.PirWriter,
+    "fastq": QualityIO.FastqPhredWriter,
+    "fastq-sanger": QualityIO.FastqPhredWriter,
+    "fastq-solexa": QualityIO.FastqSolexaWriter,
+    "fastq-illumina": QualityIO.FastqIlluminaWriter,
+    "qual": QualityIO.QualPhredWriter,
+    "seqxml": SeqXmlIO.SeqXmlWriter,
+    "sff": SffIO.SffWriter,
+    "tab": TabIO.TabWriter,
+    "xdna": XdnaIO.XdnaWriter,
+}
+
+
+def write(sequences, handle, format):
+    """Write complete set of sequences to a file.
+
+    Arguments:
+     - sequences - A list (or iterator) of SeqRecord objects, or a single
+       SeqRecord.
+     - handle    - File handle object to write to, or filename as string.
+     - format    - lower case string describing the file format to write.
+
+    Note if providing a file handle, your code should close the handle
+    after calling this function (to ensure the data gets flushed to disk).
+
+    Returns the number of records written (as an integer).
+    """
+    from Bio import AlignIO
+
+    # Try and give helpful error messages:
+    if not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format:
+        raise ValueError("Format required (lower case string)")
+    if not format.islower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+
+    if isinstance(handle, SeqRecord):
+        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
+    if isinstance(handle, list):
+        # e.g. list of SeqRecord objects
+        raise TypeError("Check arguments, handle should NOT be a list")
+
+    if isinstance(sequences, SeqRecord):
+        # This raised an exception in older versions of Biopython
+        sequences = [sequences]
+
+    # Map the file format to a writer function/class
+    format_function = _FormatToString.get(format)
+    if format_function is not None:
+        count = 0
+        with as_handle(handle, "w") as fp:
+            for record in sequences:
+                fp.write(format_function(record))
+                count += 1
+        return count
+
+    writer_class = _FormatToWriter.get(format)
+    if writer_class is not None:
+        count = writer_class(handle).write_file(sequences)
+        if not isinstance(count, int):
+            raise RuntimeError(
+                "Internal error - the underlying %s writer "
+                "should have returned the record count, not %r" % (format, count)
+            )
+        return count
+
+    if format in AlignIO._FormatToWriter:
+        # Try and turn all the records into a single alignment,
+        # and write that using Bio.AlignIO
+        alignment = MultipleSeqAlignment(sequences)
+        alignment_count = AlignIO.write([alignment], handle, format)
+        if alignment_count != 1:
+            raise RuntimeError(
+                "Internal error - the underlying writer "
+                "should have returned 1, not %r" % alignment_count
+            )
+        count = len(alignment)
+        return count
+
+    if format in _FormatToIterator or format in AlignIO._FormatToIterator:
+        raise ValueError("Reading format '%s' is supported, but not writing" % format)
+
+    raise ValueError("Unknown format '%s'" % format)
+
+
+def parse(handle, format, alphabet=None):
+    r"""Turn a sequence file into an iterator returning SeqRecords.
+
+    Arguments:
+     - handle   - handle to the file, or the filename as a string
+       (note older versions of Biopython only took a handle).
+     - format   - lower case string describing the file format.
+     - alphabet - no longer used, should be None.
+
+    Typical usage, opening a file to read in, and looping over the record(s):
+
+    >>> from Bio import SeqIO
+    >>> filename = "Fasta/sweetpea.nu"
+    >>> for record in SeqIO.parse(filename, "fasta"):
+    ...    print("ID %s" % record.id)
+    ...    print("Sequence length %i" % len(record))
+    ID gi|3176602|gb|U78617.1|LOU78617
+    Sequence length 309
+
+    For lazy-loading file formats such as twobit, where the file contents
+    are read on demand only, ensure that the file remains open while extracting
+    sequence data.
+
+    If you have a string 'data' containing the file contents, you must
+    first turn this into a handle in order to parse it:
+
+    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
+    >>> from Bio import SeqIO
+    >>> from io import StringIO
+    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
+    ...     print("%s %s" % (record.id, record.seq))
+    Alpha ACCGGATGTA
+    Beta AGGCTCGGTTA
+
+    Use the Bio.SeqIO.read(...) function when you expect a single record
+    only.
+    """
+    # NOTE - The above docstring has some raw \n characters needed
+    # for the StringIO example, hence the whole docstring is in raw
+    # string mode (see the leading r before the opening quote).
+    from Bio import AlignIO
+
+    # Try and give helpful error messages:
+    if not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format:
+        raise ValueError("Format required (lower case string)")
+    if not format.islower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    iterator_generator = _FormatToIterator.get(format)
+    if iterator_generator:
+        return iterator_generator(handle)
+    if format in AlignIO._FormatToIterator:
+        # Use Bio.AlignIO to read in the alignments
+        return (r for alignment in AlignIO.parse(handle, format) for r in alignment)
+    raise ValueError("Unknown format '%s'" % format)
+
+
+def read(handle, format, alphabet=None):
+    """Turn a sequence file into a single SeqRecord.
+
+    Arguments:
+     - handle   - handle to the file, or the filename as a string
+       (note older versions of Biopython only took a handle).
+     - format   - string describing the file format.
+     - alphabet - no longer used, should be None.
+
+    This function is for use parsing sequence files containing
+    exactly one record.  For example, reading a GenBank file:
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
+    >>> print("ID %s" % record.id)
+    ID AC007323.5
+    >>> print("Sequence length %i" % len(record))
+    Sequence length 86436
+
+    If the handle contains no records, or more than one record,
+    an exception is raised.  For example:
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
+    Traceback (most recent call last):
+        ...
+    ValueError: More than one record found in handle
+
+    If however you want the first record from a file containing
+    multiple records this function would raise an exception (as
+    shown in the example above).  Instead use:
+
+    >>> from Bio import SeqIO
+    >>> record = next(SeqIO.parse("GenBank/cor6_6.gb", "genbank"))
+    >>> print("First record's ID %s" % record.id)
+    First record's ID X55053.1
+
+    Use the Bio.SeqIO.parse(handle, format) function if you want
+    to read multiple records from the handle.
+    """
+    iterator = parse(handle, format, alphabet)
+    try:
+        record = next(iterator)
+    except StopIteration:
+        raise ValueError("No records found in handle") from None
+    try:
+        next(iterator)
+        raise ValueError("More than one record found in handle")
+    except StopIteration:
+        pass
+    return record
+
+
+def to_dict(sequences, key_function=None):
+    """Turn a sequence iterator or list into a dictionary.
+
+    Arguments:
+     - sequences  - An iterator that returns SeqRecord objects,
+       or simply a list of SeqRecord objects.
+     - key_function - Optional callback function which when given a
+       SeqRecord should return a unique key for the dictionary.
+
+    e.g. key_function = lambda rec : rec.name
+    or,  key_function = lambda rec : rec.description.split()[0]
+
+    If key_function is omitted then record.id is used, on the assumption
+    that the records objects returned are SeqRecords with a unique id.
+
+    If there are duplicate keys, an error is raised.
+
+    Since Python 3.7, the default dict class maintains key order, meaning
+    this dictionary will reflect the order of records given to it. For
+    CPython and PyPy, this was already implemented for Python 3.6, so
+    effectively you can always assume the record order is preserved.
+
+    Example usage, defaulting to using the record.id as key:
+
+    >>> from Bio import SeqIO
+    >>> filename = "GenBank/cor6_6.gb"
+    >>> format = "genbank"
+    >>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
+    >>> print(list(id_dict))
+    ['X55053.1', 'X62281.1', 'M81224.1', 'AJ237582.1', 'L31939.1', 'AF297471.1']
+    >>> print(id_dict["L31939.1"].description)
+    Brassica rapa (clone bif72) kin mRNA, complete cds
+
+    A more complex example, using the key_function argument in order to
+    use a sequence checksum as the dictionary key:
+
+    >>> from Bio import SeqIO
+    >>> from Bio.SeqUtils.CheckSum import seguid
+    >>> filename = "GenBank/cor6_6.gb"
+    >>> format = "genbank"
+    >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
+    ...               key_function = lambda rec : seguid(rec.seq))
+    >>> for key, record in sorted(seguid_dict.items()):
+    ...     print("%s %s" % (key, record.id))
+    /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
+    BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
+    SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
+    TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
+    l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
+    uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
+
+    This approach is not suitable for very large sets of sequences, as all
+    the SeqRecord objects are held in memory. Instead, consider using the
+    Bio.SeqIO.index() function (if it supports your particular file format).
+    """
+    # This is to avoid a lambda function:
+
+    def _default_key_function(rec):
+        return rec.id
+
+    if key_function is None:
+        key_function = _default_key_function
+
+    d = {}
+    for record in sequences:
+        key = key_function(record)
+        if key in d:
+            raise ValueError("Duplicate key '%s'" % key)
+        d[key] = record
+    return d
+
+
+def index(filename, format, alphabet=None, key_function=None):
+    """Indexes a sequence file and returns a dictionary like object.
+
+    Arguments:
+     - filename - string giving name of file to be indexed
+     - format   - lower case string describing the file format
+     - alphabet - no longer used, leave as None
+     - key_function - Optional callback function which when given a
+       SeqRecord identifier string should return a unique key for the
+       dictionary.
+
+    This indexing function will return a dictionary like object, giving the
+    SeqRecord objects as values.
+
+    As of Biopython 1.69, this will preserve the ordering of the records in
+    the file when iterating over the entries.
+
+    >>> from Bio import SeqIO
+    >>> records = SeqIO.index("Quality/example.fastq", "fastq")
+    >>> len(records)
+    3
+    >>> list(records)  # make a list of the keys
+    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_540_792', 'EAS54_6_R1_2_1_443_348']
+    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
+    >EAS54_6_R1_2_1_540_792
+    TTGGCAGGCCAAGGCCGATGGATCA
+    
+    >>> "EAS54_6_R1_2_1_540_792" in records
+    True
+    >>> print(records.get("Missing", None))
+    None
+    >>> records.close()
+
+    If the file is BGZF compressed, this is detected automatically. Ordinary
+    GZIP files are not supported:
+
+    >>> from Bio import SeqIO
+    >>> records = SeqIO.index("Quality/example.fastq.bgz", "fastq")
+    >>> len(records)
+    3
+    >>> print(records["EAS54_6_R1_2_1_540_792"].seq)
+    TTGGCAGGCCAAGGCCGATGGATCA
+    >>> records.close()
+
+    When you call the index function, it will scan through the file, noting
+    the location of each record. When you access a particular record via the
+    dictionary methods, the code will jump to the appropriate part of the
+    file and then parse that section into a SeqRecord.
+
+    Note that not all the input formats supported by Bio.SeqIO can be used
+    with this index function. It is designed to work only with sequential
+    file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
+    interlaced file format (e.g. alignment formats such as "clustal").
+
+    For small files, it may be more efficient to use an in memory Python
+    dictionary, e.g.
+
+    >>> from Bio import SeqIO
+    >>> records = SeqIO.to_dict(SeqIO.parse("Quality/example.fastq", "fastq"))
+    >>> len(records)
+    3
+    >>> list(records)  # make a list of the keys
+    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_540_792', 'EAS54_6_R1_2_1_443_348']
+    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
+    >EAS54_6_R1_2_1_540_792
+    TTGGCAGGCCAAGGCCGATGGATCA
+    
+
+    As with the to_dict() function, by default the id string of each record
+    is used as the key. You can specify a callback function to transform
+    this (the record identifier string) into your preferred key. For example:
+
+    >>> from Bio import SeqIO
+    >>> def make_tuple(identifier):
+    ...     parts = identifier.split("_")
+    ...     return int(parts[-2]), int(parts[-1])
+    >>> records = SeqIO.index("Quality/example.fastq", "fastq",
+    ...                       key_function=make_tuple)
+    >>> len(records)
+    3
+    >>> list(records)  # make a list of the keys
+    [(413, 324), (540, 792), (443, 348)]
+    >>> print(records[(540, 792)].format("fasta"))
+    >EAS54_6_R1_2_1_540_792
+    TTGGCAGGCCAAGGCCGATGGATCA
+    
+    >>> (540, 792) in records
+    True
+    >>> "EAS54_6_R1_2_1_540_792" in records
+    False
+    >>> print(records.get("Missing", None))
+    None
+    >>> records.close()
+
+    Another common use case would be indexing an NCBI style FASTA file,
+    where you might want to extract the GI number from the FASTA identifier
+    to use as the dictionary key.
+
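+    A minimal sketch of such a key function (assuming identifiers of the
+    form "gi|12345|..."; real NCBI identifiers vary):
+
+    >>> def get_gi(identifier):
+    ...     return identifier.split("|")[1]
+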
+    Notice that unlike the to_dict() function, here the key_function does
+    not get given the full SeqRecord to use to generate the key. Doing so
+    would impose a severe performance penalty as it would require the file
+    to be completely parsed while building the index; the implementation
+    therefore avoids full parsing wherever possible.
+
+    See Also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()
+
+    """
+    # Try and give helpful error messages:
+    if not isinstance(filename, str):
+        raise TypeError("Need a filename (not a handle)")
+    if not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format:
+        raise ValueError("Format required (lower case string)")
+    if not format.islower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    # Map the file format to a sequence iterator:
+    from ._index import _FormatToRandomAccess  # Lazy import
+    from Bio.File import _IndexedSeqFileDict
+
+    try:
+        proxy_class = _FormatToRandomAccess[format]
+    except KeyError:
+        raise ValueError("Unsupported format %r" % format) from None
+    repr = "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" % (
+        filename,
+        format,
+        alphabet,
+        key_function,
+    )
+    return _IndexedSeqFileDict(
+        proxy_class(filename, format), key_function, repr, "SeqRecord"
+    )
+
+
+def index_db(
+    index_filename, filenames=None, format=None, alphabet=None, key_function=None
+):
+    """Index several sequence files and return a dictionary like object.
+
+    The index is stored in an SQLite database rather than in memory (as in the
+    Bio.SeqIO.index(...) function).
+
+    Arguments:
+     - index_filename - Where to store the SQLite index
+     - filenames - list of strings specifying file(s) to be indexed, or when
+       indexing a single file this can be given as a string.
+       (optional if reloading an existing index, but must match)
+     - format   - lower case string describing the file format
+       (optional if reloading an existing index, but must match)
+     - alphabet - no longer used, leave as None.
+     - key_function - Optional callback function which when given a
+       SeqRecord identifier string should return a unique
+       key for the dictionary.
+
+    This indexing function will return a dictionary like object, giving the
+    SeqRecord objects as values:
+
+    >>> from Bio import SeqIO
+    >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
+    >>> def get_gi(name):
+    ...     parts = name.split("|")
+    ...     i = parts.index("gi")
+    ...     assert i != -1
+    ...     return parts[i+1]
+    >>> idx_name = ":memory:" #use an in memory SQLite DB for this test
+    >>> records = SeqIO.index_db(idx_name, files, "fasta", key_function=get_gi)
+    >>> len(records)
+    95
+    >>> records["7525076"].description
+    'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
+    >>> records["45478717"].description
+    'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'
+    >>> records.close()
+
+    In this example the two files contain 85 and 10 records respectively.
+
+    BGZF compressed files are supported, and detected automatically. Ordinary
+    GZIP compressed files are not supported.
+
+    See Also: Bio.SeqIO.index() and Bio.SeqIO.to_dict(), and the Python module
+    glob which is useful for building lists of files.
+
+    """
+    # Try and give helpful error messages:
+    if not isinstance(index_filename, str):
+        raise TypeError("Need a string for the index filename")
+    if isinstance(filenames, str):
+        # Make the API a little more friendly, and more similar
+        # to Bio.SeqIO.index(...) for indexing just one file.
+        filenames = [filenames]
+    if filenames is not None and not isinstance(filenames, list):
+        raise TypeError("Need a list of filenames (as strings), or one filename")
+    if format is not None and not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if format and not format.islower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+    if alphabet is not None:
+        raise ValueError("The alphabet argument is no longer supported")
+
+    # Map the file format to a sequence iterator:
+    from ._index import _FormatToRandomAccess  # Lazy import
+    from Bio.File import _SQLiteManySeqFilesDict
+
+    repr = "SeqIO.index_db(%r, filenames=%r, format=%r, key_function=%r)" % (
+        index_filename,
+        filenames,
+        format,
+        key_function,
+    )
+
+    def proxy_factory(format, filename=None):
+        """Given a filename returns proxy object, else boolean if format OK."""
+        if filename:
+            return _FormatToRandomAccess[format](filename, format)
+        else:
+            return format in _FormatToRandomAccess
+
+    return _SQLiteManySeqFilesDict(
+        index_filename, filenames, proxy_factory, format, key_function, repr
+    )
+
+
+# TODO? - Handling aliases explicitly would let us shorten this list:
+_converter = {
+    ("genbank", "fasta"): InsdcIO._genbank_convert_fasta,
+    ("gb", "fasta"): InsdcIO._genbank_convert_fasta,
+    ("embl", "fasta"): InsdcIO._embl_convert_fasta,
+    ("fastq", "fasta"): QualityIO._fastq_convert_fasta,
+    ("fastq-sanger", "fasta"): QualityIO._fastq_convert_fasta,
+    ("fastq-solexa", "fasta"): QualityIO._fastq_convert_fasta,
+    ("fastq-illumina", "fasta"): QualityIO._fastq_convert_fasta,
+    ("fastq", "tab"): QualityIO._fastq_convert_tab,
+    ("fastq-sanger", "tab"): QualityIO._fastq_convert_tab,
+    ("fastq-solexa", "tab"): QualityIO._fastq_convert_tab,
+    ("fastq-illumina", "tab"): QualityIO._fastq_convert_tab,
+    ("fastq", "fastq"): QualityIO._fastq_sanger_convert_fastq_sanger,
+    ("fastq-sanger", "fastq"): QualityIO._fastq_sanger_convert_fastq_sanger,
+    ("fastq-solexa", "fastq"): QualityIO._fastq_solexa_convert_fastq_sanger,
+    ("fastq-illumina", "fastq"): QualityIO._fastq_illumina_convert_fastq_sanger,
+    ("fastq", "fastq-sanger"): QualityIO._fastq_sanger_convert_fastq_sanger,
+    ("fastq-sanger", "fastq-sanger"): QualityIO._fastq_sanger_convert_fastq_sanger,
+    ("fastq-solexa", "fastq-sanger"): QualityIO._fastq_solexa_convert_fastq_sanger,
+    ("fastq-illumina", "fastq-sanger"): QualityIO._fastq_illumina_convert_fastq_sanger,
+    ("fastq", "fastq-solexa"): QualityIO._fastq_sanger_convert_fastq_solexa,
+    ("fastq-sanger", "fastq-solexa"): QualityIO._fastq_sanger_convert_fastq_solexa,
+    ("fastq-solexa", "fastq-solexa"): QualityIO._fastq_solexa_convert_fastq_solexa,
+    ("fastq-illumina", "fastq-solexa"): QualityIO._fastq_illumina_convert_fastq_solexa,
+    ("fastq", "fastq-illumina"): QualityIO._fastq_sanger_convert_fastq_illumina,
+    ("fastq-sanger", "fastq-illumina"): QualityIO._fastq_sanger_convert_fastq_illumina,
+    ("fastq-solexa", "fastq-illumina"): QualityIO._fastq_solexa_convert_fastq_illumina,
+    (
+        "fastq-illumina",
+        "fastq-illumina",
+    ): QualityIO._fastq_illumina_convert_fastq_illumina,
+    ("fastq", "qual"): QualityIO._fastq_sanger_convert_qual,
+    ("fastq-sanger", "qual"): QualityIO._fastq_sanger_convert_qual,
+    ("fastq-solexa", "qual"): QualityIO._fastq_solexa_convert_qual,
+    ("fastq-illumina", "qual"): QualityIO._fastq_illumina_convert_qual,
+}
+
+
+def convert(in_file, in_format, out_file, out_format, molecule_type=None):
+    """Convert between two sequence file formats, return number of records.
+
+    Arguments:
+     - in_file - an input handle or filename
+     - in_format - input file format, lower case string
+     - out_file - an output handle or filename
+     - out_format - output file format, lower case string
+     - molecule_type - optional molecule type to apply, string containing
+       "DNA", "RNA" or "protein".
+
+    **NOTE** - If you provide an output filename, it will be opened which will
+    overwrite any existing file without warning.
+
+    The idea here is that while doing this will work::
+
+        from Bio import SeqIO
+        records = SeqIO.parse(in_handle, in_format)
+        count = SeqIO.write(records, out_handle, out_format)
+
+    it is shorter to write::
+
+        from Bio import SeqIO
+        count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
+
+    Also, Bio.SeqIO.convert is faster for some conversions as it can make some
+    optimisations.
+
+    For example, going from a filename to a handle:
+
+    >>> from Bio import SeqIO
+    >>> from io import StringIO
+    >>> handle = StringIO("")
+    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
+    3
+    >>> print(handle.getvalue())
+    >EAS54_6_R1_2_1_413_324
+    CCCTTCTTGTCTTCAGCGTTTCTCC
+    >EAS54_6_R1_2_1_540_792
+    TTGGCAGGCCAAGGCCGATGGATCA
+    >EAS54_6_R1_2_1_443_348
+    GTTGCTTCTGGCGTGGGTGGGGGGG
+    
+
+    Note some formats like SeqXML require you to specify the molecule type
+    when it cannot be determined by the parser:
+
+    >>> from Bio import SeqIO
+    >>> from io import BytesIO
+    >>> handle = BytesIO()
+    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "seqxml", "DNA")
+    3
+    """
+    if molecule_type:
+        if not isinstance(molecule_type, str):
+            raise TypeError("Molecule type should be a string, not %r" % molecule_type)
+        elif (
+            "DNA" in molecule_type
+            or "RNA" in molecule_type
+            or "protein" in molecule_type
+        ):
+            pass
+        else:
+            raise ValueError("Unexpected molecule type, %r" % molecule_type)
+    f = _converter.get((in_format, out_format))
+    if f:
+        count = f(in_file, out_file)
+    else:
+        records = parse(in_file, in_format)
+        if molecule_type:
+            # Edit the records on the fly to set molecule type
+
+            def over_ride(record):
+                """Over-ride molecule in-place."""
+                record.annotations["molecule_type"] = molecule_type
+                return record
+
+            records = (over_ride(_) for _ in records)
+        count = write(records, out_file, out_format)
+    return count
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqIO/__pycache__/AbiIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/AbiIO.cpython-37.pyc
new file mode 100644
index 0000000..c8d472e
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/AbiIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/AceIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/AceIO.cpython-37.pyc
new file mode 100644
index 0000000..fa2f285
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/AceIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/FastaIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/FastaIO.cpython-37.pyc
new file mode 100644
index 0000000..042a384
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/FastaIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/GckIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/GckIO.cpython-37.pyc
new file mode 100644
index 0000000..93cbcde
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/GckIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/IgIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/IgIO.cpython-37.pyc
new file mode 100644
index 0000000..a874469
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/IgIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/InsdcIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/InsdcIO.cpython-37.pyc
new file mode 100644
index 0000000..f24aec4
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/InsdcIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/Interfaces.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/Interfaces.cpython-37.pyc
new file mode 100644
index 0000000..4c8a80f
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/Interfaces.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/NibIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/NibIO.cpython-37.pyc
new file mode 100644
index 0000000..8e10f6c
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/NibIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/PdbIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/PdbIO.cpython-37.pyc
new file mode 100644
index 0000000..89efc3b
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/PdbIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/PhdIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/PhdIO.cpython-37.pyc
new file mode 100644
index 0000000..e9c0140
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/PhdIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/PirIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/PirIO.cpython-37.pyc
new file mode 100644
index 0000000..6c94458
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/PirIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/QualityIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/QualityIO.cpython-37.pyc
new file mode 100644
index 0000000..8c06475
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/QualityIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/SeqXmlIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/SeqXmlIO.cpython-37.pyc
new file mode 100644
index 0000000..51dc7db
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/SeqXmlIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/SffIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/SffIO.cpython-37.pyc
new file mode 100644
index 0000000..7c8f35b
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/SffIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/SnapGeneIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/SnapGeneIO.cpython-37.pyc
new file mode 100644
index 0000000..60c0ede
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/SnapGeneIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/SwissIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/SwissIO.cpython-37.pyc
new file mode 100644
index 0000000..180c01b
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/SwissIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/TabIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/TabIO.cpython-37.pyc
new file mode 100644
index 0000000..402ee74
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/TabIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/TwoBitIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/TwoBitIO.cpython-37.pyc
new file mode 100644
index 0000000..4611e38
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/TwoBitIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/UniprotIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/UniprotIO.cpython-37.pyc
new file mode 100644
index 0000000..dcb0cc7
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/UniprotIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/XdnaIO.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/XdnaIO.cpython-37.pyc
new file mode 100644
index 0000000..5ef7092
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/XdnaIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..7119cbc
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/__pycache__/_index.cpython-37.pyc b/code/lib/Bio/SeqIO/__pycache__/_index.cpython-37.pyc
new file mode 100644
index 0000000..2d5b738
Binary files /dev/null and b/code/lib/Bio/SeqIO/__pycache__/_index.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqIO/_index.py b/code/lib/Bio/SeqIO/_index.py
new file mode 100644
index 0000000..560b1c2
--- /dev/null
+++ b/code/lib/Bio/SeqIO/_index.py
@@ -0,0 +1,713 @@
+# Copyright 2009-2011 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Dictionary like indexing of sequence files (PRIVATE).
+
+You are not expected to access this module, or any of its code, directly. This
+is all handled internally by the Bio.SeqIO.index(...) and index_db(...)
+functions which are the public interface for this functionality.
+
+The basic idea is that we scan over a sequence file, looking for new record
+markers. We then try to extract the string that Bio.SeqIO.parse/read would
+use as the record id, ideally without actually parsing the full record. We
+then use a subclassed Python dictionary to record the file offset for the
+record start against the record id.
+
+Note that this means full parsing is on demand, so any invalid or problem
+record may not trigger an exception until it is accessed. This is by design.
+
+This means our dictionary like objects have in memory ALL the keys (all the
+record identifiers), which shouldn't be a problem even with second generation
+sequencing. If memory is an issue, the index_db(...) interface stores the
+keys and offsets in an SQLite database - which can be re-used to avoid
+re-indexing the file the next time it is needed.
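+
+As an illustrative sketch of the idea (the names here are invented, not the
+real internals)::
+
+    offsets = {"recA": 0, "recB": 1024}   # record id -> file offset
+    handle.seek(offsets["recB"])          # jump straight to that record
+    record = next(SeqIO.parse(handle, "fasta"))
+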
+"""
+import re
+
+from io import BytesIO
+from io import StringIO
+
+from Bio import SeqIO
+from Bio.File import _IndexedSeqFileProxy
+from Bio.File import _open_for_random_access
+
+
+class SeqFileRandomAccess(_IndexedSeqFileProxy):
+    """Base class for defining random access to sequence files."""
+
+    def __init__(self, filename, format):
+        """Initialize the class."""
+        self._handle = _open_for_random_access(filename)
+        self._format = format
+        # Load the parser class/function once and avoid the dict lookup in each
+        # __getitem__ call:
+        self._iterator = SeqIO._FormatToIterator[format]
+
+    def get(self, offset):
+        """Return SeqRecord."""
+        # Should be overridden for binary file formats etc:
+        return next(self._iterator(StringIO(self.get_raw(offset).decode())))
+
+
+####################
+# Special indexers #
+####################
+# Anything where the records cannot be read simply by parsing from
+# the record start. For example, anything requiring information from
+# a file header - e.g. SFF files where we would need to know the
+# number of flows.
+class SffRandomAccess(SeqFileRandomAccess):
+    """Random access to a Standard Flowgram Format (SFF) file."""
+
+    def __init__(self, filename, format):
+        """Initialize the class."""
+        SeqFileRandomAccess.__init__(self, filename, format)
+        (
+            header_length,
+            index_offset,
+            index_length,
+            number_of_reads,
+            self._flows_per_read,
+            self._flow_chars,
+            self._key_sequence,
+        ) = SeqIO.SffIO._sff_file_header(self._handle)
+
+    def __iter__(self):
+        """Load any index block in the file, or build it the slow way (PRIVATE)."""
+        handle = self._handle
+        handle.seek(0)
+        # Already did this in __init__ but need the handle in the right place
+        (
+            header_length,
+            index_offset,
+            index_length,
+            number_of_reads,
+            self._flows_per_read,
+            self._flow_chars,
+            self._key_sequence,
+        ) = SeqIO.SffIO._sff_file_header(handle)
+        if index_offset and index_length:
+            # There is an index provided, try this the fast way:
+            count = 0
+            max_offset = 0
+            try:
+                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
+                    max_offset = max(max_offset, offset)
+                    yield name, offset, 0
+                    count += 1
+                if count != number_of_reads:
+                    raise ValueError(
+                        "Indexed %i records, expected %i" % (count, number_of_reads)
+                    )
+                # If that worked, call _check_eof ...
+            except ValueError as err:
+                import warnings
+                from Bio import BiopythonParserWarning
+
+                warnings.warn(
+                    "Could not parse the SFF index: %s" % err, BiopythonParserWarning
+                )
+                assert count == 0, "Partially populated index"
+                handle.seek(0)
+                # Drop out to the slow way...
+            else:
+                # Fast way worked, check EOF
+                if index_offset + index_length <= max_offset:
+                    # Can have an index at start (or mid-file)
+                    handle.seek(max_offset)
+                    # Parse the final read,
+                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+                    # Should now be at the end of the file!
+                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+                return
+        # We used to give a warning in this case, but Ion Torrent's
+        # SFF files don't have an index so that would be annoying.
+        # Fall back on the slow way!
+        count = 0
+        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
+            yield name, offset, 0
+            count += 1
+        if count != number_of_reads:
+            raise ValueError(
+                "Indexed %i records, expected %i" % (count, number_of_reads)
+            )
+        SeqIO.SffIO._check_eof(handle, index_offset, index_length)
+
+    def get(self, offset):
+        """Return the SeqRecord starting at the given offset."""
+        handle = self._handle
+        handle.seek(offset)
+        return SeqIO.SffIO._sff_read_seq_record(
+            handle, self._flows_per_read, self._flow_chars, self._key_sequence,
+        )
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        handle = self._handle
+        handle.seek(offset)
+        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
+
+
+class SffTrimedRandomAccess(SffRandomAccess):
+    """Random access to an SFF file with defined trimming applied to each sequence."""
+
+    def get(self, offset):
+        """Return the SeqRecord starting at the given offset."""
+        handle = self._handle
+        handle.seek(offset)
+        return SeqIO.SffIO._sff_read_seq_record(
+            handle,
+            self._flows_per_read,
+            self._flow_chars,
+            self._key_sequence,
+            trim=True,
+        )
+
+
+###################
+# Simple indexers #
+###################
+
+
+class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
+    """Random access to a simple sequential sequence file."""
+
+    def __init__(self, filename, format):
+        """Initialize the class."""
+        SeqFileRandomAccess.__init__(self, filename, format)
+        marker = {
+            "ace": b"CO ",
+            "embl": b"ID ",
+            "fasta": b">",
+            "genbank": b"LOCUS ",
+            "gb": b"LOCUS ",
+            "imgt": b"ID ",
+            "phd": b"BEGIN_SEQUENCE",
+            "pir": b">..;",
+            "qual": b">",
+            "swiss": b"ID ",
+            "uniprot-xml": b" end of this record
+                break
+            lines.append(line)
+        return b"".join(lines)
+
+
+#######################################
+# Fiddly indexers: GenBank, EMBL, ... #
+#######################################
+
+
+class GenBankRandomAccess(SequentialSeqFileRandomAccess):
+    """Indexed dictionary like access to a GenBank file."""
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        marker_re = self._marker_re
+        accession_marker = b"ACCESSION "
+        version_marker = b"VERSION "
+        # Skip any header before first record
+        while True:
+            start_offset = handle.tell()
+            line = handle.readline()
+            if marker_re.match(line) or not line:
+                break
+        # Should now be at the start of a record, or end of the file
+        while marker_re.match(line):
+            # We cannot assume the record.id is the first word after LOCUS,
+            # normally the first entry on the VERSION or ACCESSION line is used.
+            # However, if both are missing, the GenBank parser falls back on
+            # the LOCUS entry.
+            try:
+                key = line[5:].split(None, 1)[0]
+            except ValueError:
+                # Warning?
+                # No content in LOCUS line
+                key = None
+            length = len(line)
+            while True:
+                end_offset = handle.tell()
+                line = handle.readline()
+                if marker_re.match(line) or not line:
+                    if not key:
+                        raise ValueError(
+                            "Did not find usable ACCESSION/VERSION/LOCUS lines"
+                        )
+                    yield key.decode(), start_offset, length
+                    start_offset = end_offset
+                    break
+                elif line.startswith(accession_marker):
+                    try:
+                        key = line.rstrip().split()[1]
+                    except IndexError:
+                        # No content in ACCESSION line
+                        pass
+                elif line.startswith(version_marker):
+                    try:
+                        version_id = line.rstrip().split()[1]
+                        if (
+                            version_id.count(b".") == 1
+                            and version_id.split(b".")[1].isdigit()
+                        ):
+                            # This should mimic the GenBank parser...
+                            key = version_id
+                    except IndexError:
+                        # No content in VERSION line
+                        pass
+
+                length += len(line)
+        assert not line, repr(line)
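+
+# Key-selection sketch for a typical GenBank header:
+#
+#     LOCUS       SCU49845     5028 bp    DNA       PLN       21-JUN-1999
+#     ACCESSION   U49845
+#     VERSION     U49845.1
+#
+# The yielded key is "U49845.1": a well-formed VERSION overrides ACCESSION,
+# which in turn overrides the LOCUS fallback, mirroring the record.id chosen
+# by the GenBank parser.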
+
+
+class EmblRandomAccess(SequentialSeqFileRandomAccess):
+    """Indexed dictionary like access to an EMBL file."""
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        marker_re = self._marker_re
+        sv_marker = b"SV "
+        ac_marker = b"AC "
+        # Skip any header before first record
+        while True:
+            start_offset = handle.tell()
+            line = handle.readline()
+            if marker_re.match(line) or not line:
+                break
+        # Should now be at the start of a record, or end of the file
+        while marker_re.match(line):
+            # We cannot assume the record.id is the first word after ID,
+            # normally the SV line is used.
+            setbysv = False  # Tracks whether the key came from the SV field
+            length = len(line)
+            if line[2:].count(b";") in [5, 6]:
+                # Looks like the semicolon separated style introduced in 2006
+                # Or style from IPD-IMGT/HLA after their v3.16.0 release
+                parts = line[3:].rstrip().split(b";")
+                if parts[1].strip().startswith(sv_marker):
+                    # The SV bit gives the version
+                    key = parts[0].strip() + b"." + parts[1].strip().split()[1]
+                    setbysv = True
+                else:
+                    key = parts[0].strip()
+            elif line[2:].count(b";") in [2, 3]:
+                # Looks like the pre 2006 style, take first word only
+                # Or, with two colons, the KIPO patent variation
+                key = line[3:].strip().split(None, 1)[0]
+                if key.endswith(b";"):
+                    key = key[:-1]
+            else:
+                raise ValueError("Did not recognise the ID line layout:\n%r" % line)
+            while True:
+                line = handle.readline()
+                if marker_re.match(line) or not line:
+                    end_offset = handle.tell() - len(line)
+                    yield key.decode(), start_offset, length
+                    start_offset = end_offset
+                    break
+                elif line.startswith(ac_marker) and not setbysv:
+                    key = line.rstrip().split()[1]
+                    if key.endswith(b";"):
+                        key = key[:-1]
+                elif line.startswith(sv_marker):
+                    key = line.rstrip().split()[1]
+                    setbysv = True
+                length += len(line)
+        assert not line, repr(line)
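+
+# ID-line sketch: the semicolon-separated style
+#
+#     ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
+#
+# yields the key "X56734.1" (accession plus SV version), while the pre-2006
+# style
+#
+#     ID   X56734     standard; RNA; PLN; 1859 BP.
+#
+# yields just "X56734" (the first word of the ID line).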
+
+
+class SwissRandomAccess(SequentialSeqFileRandomAccess):
+    """Random access to a SwissProt file."""
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        marker_re = self._marker_re
+        # Skip any header before first record
+        while True:
+            start_offset = handle.tell()
+            line = handle.readline()
+            if marker_re.match(line) or not line:
+                break
+        # Should now be at the start of a record, or end of the file
+        while marker_re.match(line):
+            length = len(line)
+            # We cannot assume the record.id is the first word after ID,
+            # normally the following AC line is used.
+            line = handle.readline()
+            length += len(line)
+            assert line.startswith(b"AC ")
+            key = line[3:].strip().split(b";")[0].strip()
+            while True:
+                end_offset = handle.tell()
+                line = handle.readline()
+                if marker_re.match(line) or not line:
+                    yield key.decode(), start_offset, length
+                    start_offset = end_offset
+                    break
+                length += len(line)
+        assert not line, repr(line)
+
+
+class UniprotRandomAccess(SequentialSeqFileRandomAccess):
+    """Random access to a UniProt XML file."""
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        marker_re = self._marker_re
+        start_acc_marker = b"<accession>"
+        end_acc_marker = b"</accession>"
+        end_entry_marker = b"</entry>"
+        # Skip any header before first record
+        while True:
+            start_offset = handle.tell()
+            line = handle.readline()
+            if marker_re.match(line) or not line:
+                break
+        # Should now be at the start of a record, or end of the file
+        while marker_re.match(line):
+            length = len(line)
+            # We expect the next line to be <accession>xxx</accession>
+            # (possibly with leading spaces)
+            # but allow it to be later on within the <entry>
+            key = None
+            while True:
+                line = handle.readline()
+                if key is None and start_acc_marker in line:
+                    assert end_acc_marker in line, line
+                    key = line[line.find(start_acc_marker) + 11 :].split(b"<", 1)[0]
+                    length += len(line)
+                elif end_entry_marker in line:
+                    length += line.find(end_entry_marker) + 8
+                    end_offset = (
+                        handle.tell() - len(line) + line.find(end_entry_marker) + 8
+                    )
+                    assert start_offset + length == end_offset
+                    break
+                elif marker_re.match(line) or not line:
+                    # Start of next record or end of file
+                    raise ValueError("Didn't find end of record")
+                else:
+                    length += len(line)
+            if not key:
+                raise ValueError(
+                    "Did not find  line in bytes %i to %i"
+                    % (start_offset, start_offset + length)
+                )
+            yield key.decode(), start_offset, length
+            # Find start of next record
+            while not marker_re.match(line) and line:
+                start_offset = handle.tell()
+                line = handle.readline()
+        assert not line, repr(line)
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        handle = self._handle
+        marker_re = self._marker_re
+        end_entry_marker = b"</entry>"
+        handle.seek(offset)
+        data = [handle.readline()]
+        while True:
+            line = handle.readline()
+            i = line.find(end_entry_marker)
+            if i != -1:
+                data.append(line[: i + 8])
+                break
+            if marker_re.match(line) or not line:
+                # End of file, or start of next record
+                raise ValueError("Didn't find end of record")
+            data.append(line)
+        return b"".join(data)
+
+    def get(self, offset):
+        """Return the SeqRecord starting at the given offset."""
+        # TODO - Can we handle this directly in the parser?
+        # This is a hack - use get_raw for <entry>...</entry> and wrap it with
+        # the apparently required XML header and footer.
+        data = (
+            b"""
+        
+        """
+            + self.get_raw(offset)
+            + b""
+        )
+        return next(SeqIO.UniprotIO.UniprotIterator(BytesIO(data)))
+
+
+class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
+    """Random access to a IntelliGenetics file."""
+
+    def __init__(self, filename, format):
+        """Initialize the class."""
+        SeqFileRandomAccess.__init__(self, filename, format)
+        self._marker_re = re.compile(b"^;")
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        # Skip any header
+        offset = 0
+        line = ""
+        while True:
+            offset += len(line)
+            line = handle.readline()
+            if not line:
+                break  # Premature end of file, or just empty?
+            if not line.startswith(b";;"):
+                break
+        while line:
+            length = 0
+            assert offset + len(line) == handle.tell()
+            if not line.startswith(b";"):
+                raise ValueError("Records should start with ';' and not:\n%r" % line)
+            while line.startswith(b";"):
+                length += len(line)
+                line = handle.readline()
+            key = line.rstrip()
+            # Now look for the first line which starts with ";"
+            while line and not line.startswith(b";"):
+                length += len(line)
+                line = handle.readline()
+            yield key.decode(), offset, length
+            offset += length
+            assert offset + len(line) == handle.tell()
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        handle = self._handle
+        handle.seek(offset)
+        marker_re = self._marker_re
+        lines = []
+        line = handle.readline()
+        while line.startswith(b";"):
+            lines.append(line)
+            line = handle.readline()
+        while line and not line.startswith(b";"):
+            lines.append(line)
+            line = handle.readline()
+        return b"".join(lines)
+
+
+class TabRandomAccess(SeqFileRandomAccess):
+    """Random access to a simple tabbed file."""
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        tab_char = b"\t"
+        while True:
+            start_offset = handle.tell()
+            line = handle.readline()
+            if not line:
+                break  # End of file
+            try:
+                key = line.split(tab_char)[0]
+            except ValueError:
+                if not line.strip():
+                    # Ignore blank lines
+                    continue
+                else:
+                    raise
+            else:
+                yield key.decode(), start_offset, len(line)
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        handle = self._handle
+        handle.seek(offset)
+        return handle.readline()
+
+
+##########################
+# Now the FASTQ indexers #
+##########################
+
+
+class FastqRandomAccess(SeqFileRandomAccess):
+    """Random access to a FASTQ file (any supported variant).
+
+    With FASTQ the records all start with a "@" line, but so can quality lines.
+    Note this will cope with line-wrapped FASTQ files.
+    """
+
+    def __iter__(self):
+        """Iterate over the sequence records in the file."""
+        handle = self._handle
+        handle.seek(0)
+        id = None
+        start_offset = handle.tell()
+        line = handle.readline()
+        if not line:
+            # Empty file!
+            return
+        if line[0:1] != b"@":
+            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
+        while line:
+            # assert line[0]=="@"
+            # This record seems OK (so far)
+            id = line[1:].rstrip().split(None, 1)[0]
+            # Find the seq line(s)
+            seq_len = 0
+            length = len(line)
+            while line:
+                line = handle.readline()
+                length += len(line)
+                if line.startswith(b"+"):
+                    break
+                seq_len += len(line.strip())
+            if not line:
+                raise ValueError("Premature end of file in seq section")
+            # assert line[0]=="+"
+            # Find the qual line(s)
+            qual_len = 0
+            while line:
+                if seq_len == qual_len:
+                    if seq_len == 0:
+                        # Special case, quality line should be just "\n"
+                        line = handle.readline()
+                        if line.strip():
+                            raise ValueError(
+                                "Expected blank quality line, not %r" % line
+                            )
+                        length += len(line)  # Need to include the blank line
+                    # Should be end of record...
+                    end_offset = handle.tell()
+                    line = handle.readline()
+                    if line and line[0:1] != b"@":
+                        raise ValueError("Problem with line %r" % line)
+                    break
+                else:
+                    line = handle.readline()
+                    qual_len += len(line.strip())
+                    length += len(line)
+            if seq_len != qual_len:
+                raise ValueError("Problem with quality section")
+            yield id.decode(), start_offset, length
+            start_offset = end_offset
+        # print("EOF")
+
+    def get_raw(self, offset):
+        """Return the raw record from the file as a bytes string."""
+        # TODO - Refactor this and the __iter__ method to reduce code duplication?
+        handle = self._handle
+        handle.seek(offset)
+        line = handle.readline()
+        data = line
+        if line[0:1] != b"@":
+            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
+        # Find the seq line(s)
+        seq_len = 0
+        while line:
+            line = handle.readline()
+            data += line
+            if line.startswith(b"+"):
+                break
+            seq_len += len(line.strip())
+        if not line:
+            raise ValueError("Premature end of file in seq section")
+        assert line[0:1] == b"+"
+        # Find the qual line(s)
+        qual_len = 0
+        while line:
+            if seq_len == qual_len:
+                if seq_len == 0:
+                    # Special case, quality line should be just "\n"
+                    line = handle.readline()
+                    if line.strip():
+                        raise ValueError("Expected blank quality line, not %r" % line)
+                    data += line
+                # Should be end of record...
+                line = handle.readline()
+                if line and line[0:1] != b"@":
+                    raise ValueError("Problem with line %r" % line)
+                break
+            else:
+                line = handle.readline()
+                data += line
+                qual_len += len(line.strip())
+        if seq_len != qual_len:
+            raise ValueError("Problem with quality section")
+        return data
+
+
+###############################################################################
+
+_FormatToRandomAccess = {
+    "ace": SequentialSeqFileRandomAccess,
+    "embl": EmblRandomAccess,
+    "fasta": SequentialSeqFileRandomAccess,
+    "fastq": FastqRandomAccess,  # Class handles all three variants
+    "fastq-sanger": FastqRandomAccess,  # alias of the above
+    "fastq-solexa": FastqRandomAccess,
+    "fastq-illumina": FastqRandomAccess,
+    "genbank": GenBankRandomAccess,
+    "gb": GenBankRandomAccess,  # alias of the above
+    "ig": IntelliGeneticsRandomAccess,
+    "imgt": EmblRandomAccess,
+    "phd": SequentialSeqFileRandomAccess,
+    "pir": SequentialSeqFileRandomAccess,
+    "sff": SffRandomAccess,
+    "sff-trim": SffTrimedRandomAccess,
+    "swiss": SwissRandomAccess,
+    "tab": TabRandomAccess,
+    "qual": SequentialSeqFileRandomAccess,
+    "uniprot-xml": UniprotRandomAccess,
+}
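+
+# Dispatch sketch: Bio.SeqIO's index machinery looks the proxy class up by
+# format name, e.g.
+#
+#     proxy_class = _FormatToRandomAccess["fastq"]   # FastqRandomAccess
+#     proxy = proxy_class("example.fastq", "fastq")  # hypothetical file name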
diff --git a/code/lib/Bio/SeqIO/_twoBitIO.c b/code/lib/Bio/SeqIO/_twoBitIO.c
new file mode 100644
index 0000000..6bfa373
--- /dev/null
+++ b/code/lib/Bio/SeqIO/_twoBitIO.c
@@ -0,0 +1,480 @@
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+
+
+static const char bases[][4] = {"TTTT",  /* 00 00 00 00 */
+                                "TTTC",  /* 00 00 00 01 */
+                                "TTTA",  /* 00 00 00 10 */
+                                "TTTG",  /* 00 00 00 11 */
+                                "TTCT",  /* 00 00 01 00 */
+                                "TTCC",  /* 00 00 01 01 */
+                                "TTCA",  /* 00 00 01 10 */
+                                "TTCG",  /* 00 00 01 11 */
+                                "TTAT",  /* 00 00 10 00 */
+                                "TTAC",  /* 00 00 10 01 */
+                                "TTAA",  /* 00 00 10 10 */
+                                "TTAG",  /* 00 00 10 11 */
+                                "TTGT",  /* 00 00 11 00 */
+                                "TTGC",  /* 00 00 11 01 */
+                                "TTGA",  /* 00 00 11 10 */
+                                "TTGG",  /* 00 00 11 11 */
+                                "TCTT",  /* 00 01 00 00 */
+                                "TCTC",  /* 00 01 00 01 */
+                                "TCTA",  /* 00 01 00 10 */
+                                "TCTG",  /* 00 01 00 11 */
+                                "TCCT",  /* 00 01 01 00 */
+                                "TCCC",  /* 00 01 01 01 */
+                                "TCCA",  /* 00 01 01 10 */
+                                "TCCG",  /* 00 01 01 11 */
+                                "TCAT",  /* 00 01 10 00 */
+                                "TCAC",  /* 00 01 10 01 */
+                                "TCAA",  /* 00 01 10 10 */
+                                "TCAG",  /* 00 01 10 11 */
+                                "TCGT",  /* 00 01 11 00 */
+                                "TCGC",  /* 00 01 11 01 */
+                                "TCGA",  /* 00 01 11 10 */
+                                "TCGG",  /* 00 01 11 11 */
+                                "TATT",  /* 00 10 00 00 */
+                                "TATC",  /* 00 10 00 01 */
+                                "TATA",  /* 00 10 00 10 */
+                                "TATG",  /* 00 10 00 11 */
+                                "TACT",  /* 00 10 01 00 */
+                                "TACC",  /* 00 10 01 01 */
+                                "TACA",  /* 00 10 01 10 */
+                                "TACG",  /* 00 10 01 11 */
+                                "TAAT",  /* 00 10 10 00 */
+                                "TAAC",  /* 00 10 10 01 */
+                                "TAAA",  /* 00 10 10 10 */
+                                "TAAG",  /* 00 10 10 11 */
+                                "TAGT",  /* 00 10 11 00 */
+                                "TAGC",  /* 00 10 11 01 */
+                                "TAGA",  /* 00 10 11 10 */
+                                "TAGG",  /* 00 10 11 11 */
+                                "TGTT",  /* 00 11 00 00 */
+                                "TGTC",  /* 00 11 00 01 */
+                                "TGTA",  /* 00 11 00 10 */
+                                "TGTG",  /* 00 11 00 11 */
+                                "TGCT",  /* 00 11 01 00 */
+                                "TGCC",  /* 00 11 01 01 */
+                                "TGCA",  /* 00 11 01 10 */
+                                "TGCG",  /* 00 11 01 11 */
+                                "TGAT",  /* 00 11 10 00 */
+                                "TGAC",  /* 00 11 10 01 */
+                                "TGAA",  /* 00 11 10 10 */
+                                "TGAG",  /* 00 11 10 11 */
+                                "TGGT",  /* 00 11 11 00 */
+                                "TGGC",  /* 00 11 11 01 */
+                                "TGGA",  /* 00 11 11 10 */
+                                "TGGG",  /* 00 11 11 11 */
+                                "CTTT",  /* 01 00 00 00 */
+                                "CTTC",  /* 01 00 00 01 */
+                                "CTTA",  /* 01 00 00 10 */
+                                "CTTG",  /* 01 00 00 11 */
+                                "CTCT",  /* 01 00 01 00 */
+                                "CTCC",  /* 01 00 01 01 */
+                                "CTCA",  /* 01 00 01 10 */
+                                "CTCG",  /* 01 00 01 11 */
+                                "CTAT",  /* 01 00 10 00 */
+                                "CTAC",  /* 01 00 10 01 */
+                                "CTAA",  /* 01 00 10 10 */
+                                "CTAG",  /* 01 00 10 11 */
+                                "CTGT",  /* 01 00 11 00 */
+                                "CTGC",  /* 01 00 11 01 */
+                                "CTGA",  /* 01 00 11 10 */
+                                "CTGG",  /* 01 00 11 11 */
+                                "CCTT",  /* 01 01 00 00 */
+                                "CCTC",  /* 01 01 00 01 */
+                                "CCTA",  /* 01 01 00 10 */
+                                "CCTG",  /* 01 01 00 11 */
+                                "CCCT",  /* 01 01 01 00 */
+                                "CCCC",  /* 01 01 01 01 */
+                                "CCCA",  /* 01 01 01 10 */
+                                "CCCG",  /* 01 01 01 11 */
+                                "CCAT",  /* 01 01 10 00 */
+                                "CCAC",  /* 01 01 10 01 */
+                                "CCAA",  /* 01 01 10 10 */
+                                "CCAG",  /* 01 01 10 11 */
+                                "CCGT",  /* 01 01 11 00 */
+                                "CCGC",  /* 01 01 11 01 */
+                                "CCGA",  /* 01 01 11 10 */
+                                "CCGG",  /* 01 01 11 11 */
+                                "CATT",  /* 01 10 00 00 */
+                                "CATC",  /* 01 10 00 01 */
+                                "CATA",  /* 01 10 00 10 */
+                                "CATG",  /* 01 10 00 11 */
+                                "CACT",  /* 01 10 01 00 */
+                                "CACC",  /* 01 10 01 01 */
+                                "CACA",  /* 01 10 01 10 */
+                                "CACG",  /* 01 10 01 11 */
+                                "CAAT",  /* 01 10 10 00 */
+                                "CAAC",  /* 01 10 10 01 */
+                                "CAAA",  /* 01 10 10 10 */
+                                "CAAG",  /* 01 10 10 11 */
+                                "CAGT",  /* 01 10 11 00 */
+                                "CAGC",  /* 01 10 11 01 */
+                                "CAGA",  /* 01 10 11 10 */
+                                "CAGG",  /* 01 10 11 11 */
+                                "CGTT",  /* 01 11 00 00 */
+                                "CGTC",  /* 01 11 00 01 */
+                                "CGTA",  /* 01 11 00 10 */
+                                "CGTG",  /* 01 11 00 11 */
+                                "CGCT",  /* 01 11 01 00 */
+                                "CGCC",  /* 01 11 01 01 */
+                                "CGCA",  /* 01 11 01 10 */
+                                "CGCG",  /* 01 11 01 11 */
+                                "CGAT",  /* 01 11 10 00 */
+                                "CGAC",  /* 01 11 10 01 */
+                                "CGAA",  /* 01 11 10 10 */
+                                "CGAG",  /* 01 11 10 11 */
+                                "CGGT",  /* 01 11 11 00 */
+                                "CGGC",  /* 01 11 11 01 */
+                                "CGGA",  /* 01 11 11 10 */
+                                "CGGG",  /* 01 11 11 11 */
+                                "ATTT",  /* 10 00 00 00 */
+                                "ATTC",  /* 10 00 00 01 */
+                                "ATTA",  /* 10 00 00 10 */
+                                "ATTG",  /* 10 00 00 11 */
+                                "ATCT",  /* 10 00 01 00 */
+                                "ATCC",  /* 10 00 01 01 */
+                                "ATCA",  /* 10 00 01 10 */
+                                "ATCG",  /* 10 00 01 11 */
+                                "ATAT",  /* 10 00 10 00 */
+                                "ATAC",  /* 10 00 10 01 */
+                                "ATAA",  /* 10 00 10 10 */
+                                "ATAG",  /* 10 00 10 11 */
+                                "ATGT",  /* 10 00 11 00 */
+                                "ATGC",  /* 10 00 11 01 */
+                                "ATGA",  /* 10 00 11 10 */
+                                "ATGG",  /* 10 00 11 11 */
+                                "ACTT",  /* 10 01 00 00 */
+                                "ACTC",  /* 10 01 00 01 */
+                                "ACTA",  /* 10 01 00 10 */
+                                "ACTG",  /* 10 01 00 11 */
+                                "ACCT",  /* 10 01 01 00 */
+                                "ACCC",  /* 10 01 01 01 */
+                                "ACCA",  /* 10 01 01 10 */
+                                "ACCG",  /* 10 01 01 11 */
+                                "ACAT",  /* 10 01 10 00 */
+                                "ACAC",  /* 10 01 10 01 */
+                                "ACAA",  /* 10 01 10 10 */
+                                "ACAG",  /* 10 01 10 11 */
+                                "ACGT",  /* 10 01 11 00 */
+                                "ACGC",  /* 10 01 11 01 */
+                                "ACGA",  /* 10 01 11 10 */
+                                "ACGG",  /* 10 01 11 11 */
+                                "AATT",  /* 10 10 00 00 */
+                                "AATC",  /* 10 10 00 01 */
+                                "AATA",  /* 10 10 00 10 */
+                                "AATG",  /* 10 10 00 11 */
+                                "AACT",  /* 10 10 01 00 */
+                                "AACC",  /* 10 10 01 01 */
+                                "AACA",  /* 10 10 01 10 */
+                                "AACG",  /* 10 10 01 11 */
+                                "AAAT",  /* 10 10 10 00 */
+                                "AAAC",  /* 10 10 10 01 */
+                                "AAAA",  /* 10 10 10 10 */
+                                "AAAG",  /* 10 10 10 11 */
+                                "AAGT",  /* 10 10 11 00 */
+                                "AAGC",  /* 10 10 11 01 */
+                                "AAGA",  /* 10 10 11 10 */
+                                "AAGG",  /* 10 10 11 11 */
+                                "AGTT",  /* 10 11 00 00 */
+                                "AGTC",  /* 10 11 00 01 */
+                                "AGTA",  /* 10 11 00 10 */
+                                "AGTG",  /* 10 11 00 11 */
+                                "AGCT",  /* 10 11 01 00 */
+                                "AGCC",  /* 10 11 01 01 */
+                                "AGCA",  /* 10 11 01 10 */
+                                "AGCG",  /* 10 11 01 11 */
+                                "AGAT",  /* 10 11 10 00 */
+                                "AGAC",  /* 10 11 10 01 */
+                                "AGAA",  /* 10 11 10 10 */
+                                "AGAG",  /* 10 11 10 11 */
+                                "AGGT",  /* 10 11 11 00 */
+                                "AGGC",  /* 10 11 11 01 */
+                                "AGGA",  /* 10 11 11 10 */
+                                "AGGG",  /* 10 11 11 11 */
+                                "GTTT",  /* 11 00 00 00 */
+                                "GTTC",  /* 11 00 00 01 */
+                                "GTTA",  /* 11 00 00 10 */
+                                "GTTG",  /* 11 00 00 11 */
+                                "GTCT",  /* 11 00 01 00 */
+                                "GTCC",  /* 11 00 01 01 */
+                                "GTCA",  /* 11 00 01 10 */
+                                "GTCG",  /* 11 00 01 11 */
+                                "GTAT",  /* 11 00 10 00 */
+                                "GTAC",  /* 11 00 10 01 */
+                                "GTAA",  /* 11 00 10 10 */
+                                "GTAG",  /* 11 00 10 11 */
+                                "GTGT",  /* 11 00 11 00 */
+                                "GTGC",  /* 11 00 11 01 */
+                                "GTGA",  /* 11 00 11 10 */
+                                "GTGG",  /* 11 00 11 11 */
+                                "GCTT",  /* 11 01 00 00 */
+                                "GCTC",  /* 11 01 00 01 */
+                                "GCTA",  /* 11 01 00 10 */
+                                "GCTG",  /* 11 01 00 11 */
+                                "GCCT",  /* 11 01 01 00 */
+                                "GCCC",  /* 11 01 01 01 */
+                                "GCCA",  /* 11 01 01 10 */
+                                "GCCG",  /* 11 01 01 11 */
+                                "GCAT",  /* 11 01 10 00 */
+                                "GCAC",  /* 11 01 10 01 */
+                                "GCAA",  /* 11 01 10 10 */
+                                "GCAG",  /* 11 01 10 11 */
+                                "GCGT",  /* 11 01 11 00 */
+                                "GCGC",  /* 11 01 11 01 */
+                                "GCGA",  /* 11 01 11 10 */
+                                "GCGG",  /* 11 01 11 11 */
+                                "GATT",  /* 11 10 00 00 */
+                                "GATC",  /* 11 10 00 01 */
+                                "GATA",  /* 11 10 00 10 */
+                                "GATG",  /* 11 10 00 11 */
+                                "GACT",  /* 11 10 01 00 */
+                                "GACC",  /* 11 10 01 01 */
+                                "GACA",  /* 11 10 01 10 */
+                                "GACG",  /* 11 10 01 11 */
+                                "GAAT",  /* 11 10 10 00 */
+                                "GAAC",  /* 11 10 10 01 */
+                                "GAAA",  /* 11 10 10 10 */
+                                "GAAG",  /* 11 10 10 11 */
+                                "GAGT",  /* 11 10 11 00 */
+                                "GAGC",  /* 11 10 11 01 */
+                                "GAGA",  /* 11 10 11 10 */
+                                "GAGG",  /* 11 10 11 11 */
+                                "GGTT",  /* 11 11 00 00 */
+                                "GGTC",  /* 11 11 00 01 */
+                                "GGTA",  /* 11 11 00 10 */
+                                "GGTG",  /* 11 11 00 11 */
+                                "GGCT",  /* 11 11 01 00 */
+                                "GGCC",  /* 11 11 01 01 */
+                                "GGCA",  /* 11 11 01 10 */
+                                "GGCG",  /* 11 11 01 11 */
+                                "GGAT",  /* 11 11 10 00 */
+                                "GGAC",  /* 11 11 10 01 */
+                                "GGAA",  /* 11 11 10 10 */
+                                "GGAG",  /* 11 11 10 11 */
+                                "GGGT",  /* 11 11 11 00 */
+                                "GGGC",  /* 11 11 11 01 */
+                                "GGGA",  /* 11 11 11 10 */
+                                "GGGG",  /* 11 11 11 11 */
+                               };
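+
+/* Worked example: byte 0x1B is 00 01 10 11 as four 2-bit fields, so
+   bases[0x1B] is "TCAG" (T=00, C=01, A=10, G=11, most significant field
+   first). */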
+
+static int
+extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) {
+    uint32_t i;
+    const uint32_t size = end - start;
+    const uint32_t byteStart = start / 4;
+    const uint32_t byteEnd = (end + 3) / 4;
+
+    if (byteSize != byteEnd - byteStart) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "unexpected number of bytes %u (expected %u)",
+                     byteSize, byteEnd - byteStart);
+        return -1;
+    }
+
+    start -= byteStart * 4;
+    if (byteStart + 1 == byteEnd) {
+        /* one byte only */
+        memcpy(sequence, &(bases[*bytes][start]), size);
+    }
+    else {
+        end -= byteEnd * 4;
+        /* end is now zero or negative, measuring the distance to the byte end */
+        memcpy(sequence, &(bases[*bytes][start]), 4 - start);
+        bytes++;
+        sequence += (4 - start);
+        for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4)
+            memcpy(sequence, bases[*bytes], 4);
+        memcpy(sequence, bases[*bytes], end + 4);
+    }
+    return 0;
+}
+
+static void
+applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks)
+{
+    const Py_ssize_t nBlockCount = nBlocks->shape[0];
+    const uint32_t* const nBlockPositions = nBlocks->buf;
+
+    Py_ssize_t i;
+    for (i = 0; i < nBlockCount; i++) {
+        uint32_t nBlockStart = nBlockPositions[2*i];
+        uint32_t nBlockEnd = nBlockPositions[2*i+1];
+        if (nBlockEnd < start) continue;
+        if (end < nBlockStart) break;
+        if (nBlockStart < start) nBlockStart = start;
+        if (end < nBlockEnd) nBlockEnd = end;
+        memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart);
+    }
+}
+
+static void
+applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks)
+{
+    const Py_ssize_t maskBlockCount = maskBlocks->shape[0];
+    const uint32_t* const maskBlockPositions = maskBlocks->buf;
+    const char diff = 'a' - 'A';
+
+    Py_ssize_t i;
+    for (i = 0; i < maskBlockCount; i++) {
+        uint32_t j;
+        uint32_t maskBlockStart = maskBlockPositions[2*i];
+        uint32_t maskBlockEnd = maskBlockPositions[2*i+1];
+        if (maskBlockEnd < start) continue;
+        if (end < maskBlockStart) break;
+        if (maskBlockStart < start) maskBlockStart = start;
+        if (end < maskBlockEnd) maskBlockEnd = end;
+        for (j = maskBlockStart - start; j < maskBlockEnd - start; j++)
+            sequence[j] += diff;
+    }
+}
+
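+/* Converter for the "O&" arguments of PyArg_ParseTupleAndKeywords below:
+   fills a Py_buffer with an (n, 2) C-contiguous uint32 array of
+   (start, end) positions. Returning Py_CLEANUP_SUPPORTED makes Python call
+   the converter once more with object == NULL, so the buffer is released
+   on both the success and error paths. */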
+static int
+blocks_converter(PyObject* object, void* pointer)
+{
+    const int flag = PyBUF_ND | PyBUF_FORMAT;
+    Py_buffer *view = pointer;
+
+    if (object == NULL) goto exit;
+
+    if (PyObject_GetBuffer(object, view, flag) == -1) {
+        PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format.");
+        return 0;
+    }
+
+    if (view->itemsize != sizeof(uint32_t)
+     || (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "blocks have incorrect data type (itemsize %zd, format %s)",
+                     view->itemsize, view->format);
+        goto exit;
+    }
+    if (view->ndim != 2) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "blocks have incorrect rank %d (expected 2)", view->ndim);
+        goto exit;
+    }
+    if (view->shape[1] != 2) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "blocks should have two colums (found %zd)",
+                     view->shape[1]);
+        goto exit;
+    }
+    return Py_CLEANUP_SUPPORTED;
+
+exit:
+    PyBuffer_Release(view);
+    return 0;
+}
+
+static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object";
+
+static PyObject*
+TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords)
+{
+    const unsigned char *data;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    Py_ssize_t step;
+    Py_ssize_t size;
+    Py_ssize_t length;
+    Py_buffer nBlocks;
+    Py_buffer maskBlocks;
+    PyObject *object;
+    char *sequence;
+
+    static char* kwlist[] = {"data", "start", "end", "step",
+                             "nBlocks", "maskBlocks", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist,
+                                     &data, &length, &start, &end, &step,
+                                     &blocks_converter, &nBlocks,
+                                     &blocks_converter, &maskBlocks))
+        return NULL;
+
+    size = (end - start) / step;
+    object = PyBytes_FromStringAndSize(NULL, size);
+    if (!object) goto exit;
+
+    sequence = PyBytes_AS_STRING(object);
+
+    if (step == 1) {
+        if (extract(data, length, start, end, sequence) < 0) {
+            Py_DECREF(object);
+            object = NULL;
+            goto exit;
+        }
+        applyNs(sequence, start, end, &nBlocks);
+        applyMask(sequence, start, end, &maskBlocks);
+    }
+    else {
+        Py_ssize_t current, i;
+        Py_ssize_t full_start, full_end;
+        char* full_sequence;
+        if (start <= end) {
+            full_start = start;
+            full_end = end;
+            current = 0; /* first position in sequence */
+        }
+        else {
+            full_start = end + 1;
+            full_end = start + 1;
+            current = start - end - 1; /* last position in sequence */
+        }
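+        /* With a negative step the caller passes start > end: extract the
+           forward run [end+1, start+1) once, then copy from its tail
+           backwards (current starts at the last position and step < 0). */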
+        full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char));
+        if (!full_sequence) {
+            Py_DECREF(object);
+            object = NULL;
+            goto exit;
+        }
+        full_sequence[full_end-full_start] = '\0';
+        if (extract(data, length, full_start, full_end, full_sequence) < 0) {
+            PyMem_Free(full_sequence);
+            Py_DECREF(object);
+            object = NULL;
+            goto exit;
+        }
+        applyNs(full_sequence, full_start, full_end, &nBlocks);
+        applyMask(full_sequence, full_start, full_end, &maskBlocks);
+        for (i = 0; i < size; current += step, i++)
+            sequence[i] = full_sequence[current];
+        PyMem_Free(full_sequence);
+    }
+
+exit:
+    blocks_converter(NULL, &nBlocks);
+    blocks_converter(NULL, &maskBlocks);
+    return object;
+}
+
+static struct PyMethodDef _twoBitIO_methods[] = {
+    {"convert",
+     (PyCFunction)TwoBit_convert,
+     METH_VARARGS | METH_KEYWORDS,
+     TwoBit_convert__doc__
+    },
+    {NULL,          NULL, 0, NULL} /* sentinel */
+};
+
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "_twoBitIO",
+    "Parser for DNA sequence data in 2bit format",
+    -1,
+    _twoBitIO_methods,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+
+PyMODINIT_FUNC
+PyInit__twoBitIO(void)
+{
+    return PyModule_Create(&moduledef);
+}
diff --git a/code/lib/Bio/SeqIO/_twoBitIO.cp37-win_amd64.pyd b/code/lib/Bio/SeqIO/_twoBitIO.cp37-win_amd64.pyd
new file mode 100644
index 0000000..244dc40
Binary files /dev/null and b/code/lib/Bio/SeqIO/_twoBitIO.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/SeqRecord.py b/code/lib/Bio/SeqRecord.py
new file mode 100644
index 0000000..c22b5b5
--- /dev/null
+++ b/code/lib/Bio/SeqRecord.py
@@ -0,0 +1,1372 @@
+# Copyright 2000-2002 Andrew Dalke.  All rights reserved.
+# Copyright 2002-2004 Brad Chapman.  All rights reserved.
+# Copyright 2006-2020 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Represent a Sequence Record, a sequence with annotation."""
+# NEEDS TO BE SYNCHED WITH THE REST OF BIOPYTHON AND BIOPERL
+# In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
+# need to be in sync (this is the BioSQL "Database SeqRecord").
+from io import StringIO
+
+from Bio import StreamModeError
+from Bio.Seq import UndefinedSequenceError
+
+
+_NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest."
+
+
+class _RestrictedDict(dict):
+    """Dict which only allows sequences of given length as values (PRIVATE).
+
+    This simple subclass of the Python dictionary is used in the SeqRecord
+    object for holding per-letter-annotations.  This class is intended to
+    prevent simple errors by only allowing python sequences (e.g. lists,
+    strings and tuples) to be stored, and only if their length matches that
+    expected (the length of the SeqRecord's seq object).  It cannot however
+    prevent the entries being edited in situ (for example appending entries
+    to a list).
+
+    >>> x = _RestrictedDict(5)
+    >>> x["test"] = "hello"
+    >>> x
+    {'test': 'hello'}
+
+    Adding entries which don't have the expected length are blocked:
+
+    >>> x["test"] = "hello world"
+    Traceback (most recent call last):
+    ...
+    TypeError: We only allow python sequences (lists, tuples or strings) of length 5.
+
+    The expected length is stored as a private attribute,
+
+    >>> x._length
+    5
+
+    In order that the SeqRecord (and other objects using this class) can be
+    pickled, for example for use in the multiprocessing library, we need to
+    be able to pickle the restricted dictionary objects.
+
+    Using the default protocol, which is 3 on Python 3,
+
+    >>> import pickle
+    >>> y = pickle.loads(pickle.dumps(x))
+    >>> y
+    {'test': 'hello'}
+    >>> y._length
+    5
+
+    Using the highest protocol, which is 4 on Python 3,
+
+    >>> import pickle
+    >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
+    >>> z
+    {'test': 'hello'}
+    >>> z._length
+    5
+    """
+
+    def __init__(self, length):
+        """Create an EMPTY restricted dictionary."""
+        dict.__init__(self)
+        self._length = int(length)
+
+    def __setitem__(self, key, value):
+        # The check hasattr(self, "_length") is to cope with pickle protocol 2
+        # I couldn't seem to avoid this with __getstate__ and __setstate__
+        if (
+            not hasattr(value, "__len__")
+            or not hasattr(value, "__getitem__")
+            or (hasattr(self, "_length") and len(value) != self._length)
+        ):
+            raise TypeError(
+                "We only allow python sequences (lists, tuples or strings) "
+                f"of length {self._length}."
+            )
+        dict.__setitem__(self, key, value)
+
+    def update(self, new_dict):
+        # Force this to go via our strict __setitem__ method
+        for (key, value) in new_dict.items():
+            self[key] = value
+
+
+class SeqRecord:
+    """A SeqRecord object holds a sequence and information about it.
+
+    Main attributes:
+     - id          - Identifier such as a locus tag (string)
+     - seq         - The sequence itself (Seq object or similar)
+
+    Additional attributes:
+     - name        - Sequence name, e.g. gene name (string)
+     - description - Additional text (string)
+     - dbxrefs     - List of database cross references (list of strings)
+     - features    - Any (sub)features defined (list of SeqFeature objects)
+     - annotations - Further information about the whole sequence (dictionary).
+       Most entries are strings, or lists of strings.
+     - letter_annotations - Per letter/symbol annotation (restricted
+       dictionary). This holds Python sequences (lists, strings
+       or tuples) whose length matches that of the sequence.
+       A typical use would be to hold a list of integers
+       representing sequencing quality scores, or a string
+       representing the secondary structure.
+
+    You will typically use Bio.SeqIO to read in sequences from files as
+    SeqRecord objects.  However, you may want to create your own SeqRecord
+    objects directly (see the __init__ method for further details):
+
+    >>> from Bio.Seq import Seq
+    >>> from Bio.SeqRecord import SeqRecord
+    >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
+    ...                    id="YP_025292.1", name="HokC",
+    ...                    description="toxic membrane protein")
+    >>> print(record)
+    ID: YP_025292.1
+    Name: HokC
+    Description: toxic membrane protein
+    Number of features: 0
+    Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
+
+    If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
+    for this.  For the special case where you want the SeqRecord turned into
+    a string in a particular file format there is a format method which uses
+    Bio.SeqIO internally:
+
+    >>> print(record.format("fasta"))
+    >YP_025292.1 toxic membrane protein
+    MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+    <BLANKLINE>
+
+    You can also do things like slicing a SeqRecord, checking its length, etc
+
+    >>> len(record)
+    44
+    >>> edited = record[:10] + record[11:]
+    >>> print(edited.seq)
+    MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+    >>> print(record.seq)
+    MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+
+    """
+
+    def __init__(
+        self,
+        seq,
+        id="",
+        name="",
+        description="",
+        dbxrefs=None,
+        features=None,
+        annotations=None,
+        letter_annotations=None,
+    ):
+        """Create a SeqRecord.
+
+        Arguments:
+         - seq         - Sequence, required (Seq or MutableSeq)
+         - id          - Sequence identifier, recommended (string)
+         - name        - Sequence name, optional (string)
+         - description - Sequence description, optional (string)
+         - dbxrefs     - Database cross references, optional (list of strings)
+         - features    - Any (sub)features, optional (list of SeqFeature objects)
+         - annotations - Dictionary of annotations for the whole sequence
+         - letter_annotations - Dictionary of per-letter-annotations, values
+           should be strings, list or tuples of the same length as the full
+           sequence.
+
+        You will typically use Bio.SeqIO to read in sequences from files as
+        SeqRecord objects.  However, you may want to create your own SeqRecord
+        objects directly.
+
+        Note that while an id is optional, we strongly recommend you supply a
+        unique id string for each record.  This is especially important
+        if you wish to write your sequences to a file.
+
+        You can create a 'blank' SeqRecord object, and then populate the
+        attributes later.
+        """
+        if id is not None and not isinstance(id, str):
+            # Lots of existing code uses id=None... this may be a bad idea.
+            raise TypeError("id argument should be a string")
+        if not isinstance(name, str):
+            raise TypeError("name argument should be a string")
+        if not isinstance(description, str):
+            raise TypeError("description argument should be a string")
+        self._seq = seq
+        self.id = id
+        self.name = name
+        self.description = description
+
+        # database cross references (for the whole sequence)
+        if dbxrefs is None:
+            dbxrefs = []
+        elif not isinstance(dbxrefs, list):
+            raise TypeError("dbxrefs argument should be a list (of strings)")
+        self.dbxrefs = dbxrefs
+
+        # annotations about the whole sequence
+        if annotations is None:
+            annotations = {}
+        elif not isinstance(annotations, dict):
+            raise TypeError("annotations argument should be a dict")
+        self.annotations = annotations
+
+        if letter_annotations is None:
+            # annotations about each letter in the sequence
+            if seq is None:
+                # Should we allow this and use a normal unrestricted dict?
+                self._per_letter_annotations = _RestrictedDict(length=0)
+            else:
+                try:
+                    self._per_letter_annotations = _RestrictedDict(length=len(seq))
+                except TypeError:
+                    raise TypeError(
+                        "seq argument should be a Seq object or similar"
+                    ) from None
+        else:
+            # This will be handled via the property set function, which will
+            # turn this into a _RestrictedDict and thus ensure all the values
+            # in the dict are the right length
+            self.letter_annotations = letter_annotations
+
+        # annotations about parts of the sequence
+        if features is None:
+            features = []
+        elif not isinstance(features, list):
+            raise TypeError(
+                "features argument should be a list (of SeqFeature objects)"
+            )
+        self.features = features
+
+    # TODO - Just make this a read only property?
+    def _set_per_letter_annotations(self, value):
+        if not isinstance(value, dict):
+            raise TypeError(
+                "The per-letter-annotations should be a (restricted) dictionary."
+            )
+        # Turn this into a restricted-dictionary (and check the entries)
+        try:
+            self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+        except AttributeError:
+            # e.g. seq is None
+            self._per_letter_annotations = _RestrictedDict(length=0)
+        self._per_letter_annotations.update(value)
+
+    letter_annotations = property(
+        fget=lambda self: self._per_letter_annotations,
+        fset=_set_per_letter_annotations,
+        doc="""Dictionary of per-letter-annotation for the sequence.
+
+        For example, this can hold quality scores used in FASTQ or QUAL files.
+        Consider this example using Bio.SeqIO to read in an example Solexa
+        variant FASTQ file as a SeqRecord:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
+        >>> print("%s %s" % (record.id, record.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(record.letter_annotations))
+        ['solexa_quality']
+        >>> print(record.letter_annotations["solexa_quality"])
+        [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
+
+        The letter_annotations get sliced automatically if you slice the
+        parent SeqRecord, for example taking the last ten bases:
+
+        >>> sub_record = record[-10:]
+        >>> print("%s %s" % (sub_record.id, sub_record.seq))
+        slxa_0001_1_0001_01 ACGTNNNNNN
+        >>> print(sub_record.letter_annotations["solexa_quality"])
+        [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
+
+        Any python sequence (i.e. list, tuple or string) can be recorded in
+        the SeqRecord's letter_annotations dictionary as long as the length
+        matches that of the SeqRecord's sequence.  e.g.
+
+        >>> len(sub_record.letter_annotations)
+        1
+        >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
+        >>> len(sub_record.letter_annotations)
+        2
+
+        You can delete entries from the letter_annotations dictionary as usual:
+
+        >>> del sub_record.letter_annotations["solexa_quality"]
+        >>> sub_record.letter_annotations
+        {'dummy': 'abcdefghij'}
+
+        You can completely clear the dictionary easily as follows:
+
+        >>> sub_record.letter_annotations = {}
+        >>> sub_record.letter_annotations
+        {}
+
+        Note that if replacing the record's sequence with a sequence of a
+        different length you must first clear the letter_annotations dict.
+        """,
+    )
+
+    def _set_seq(self, value):
+        # TODO - Add a deprecation warning that the seq should be write only?
+        if self._per_letter_annotations:
+            if len(self) != len(value):
+                # TODO - Make this a warning? Silently empty the dictionary?
+                raise ValueError("You must empty the letter annotations first!")
+            else:
+                # Leave the existing per letter annotations unchanged:
+                self._seq = value
+        else:
+            self._seq = value
+            # Reset the (empty) letter annotations dict with new length:
+            try:
+                self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+            except AttributeError:
+                # e.g. seq is None
+                self._per_letter_annotations = _RestrictedDict(length=0)
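+
+    # A small sketch of the rule enforced above: replacing the sequence with
+    # one of a different length only works once letter_annotations is empty,
+    # e.g.
+    #
+    #     record.letter_annotations = {}
+    #     record.seq = Seq("ACGTACGT")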
+
+    seq = property(
+        fget=lambda self: self._seq,
+        fset=_set_seq,
+        doc="The sequence itself, as a Seq or MutableSeq object.",
+    )
+
+    def __getitem__(self, index):
+        """Return a sub-sequence or an individual letter.
+
+        Slicing, e.g. my_record[5:10], returns a new SeqRecord for
+        that sub-sequence with some annotation preserved as follows:
+
+        * The name, id and description are kept as-is.
+        * Any per-letter-annotations are sliced to match the requested
+          sub-sequence.
+        * Unless a stride is used, all those features which fall fully
+          within the subsequence are included (with their locations
+          adjusted accordingly). If you want to preserve any truncated
+          features (e.g. GenBank/EMBL source features), you must
+          explicitly add them to the new SeqRecord yourself.
+        * With the exception of any molecule type, the annotations
+          dictionary and the dbxrefs list are not used for the new
+          SeqRecord, as in general they may not apply to the
+          subsequence. If you want to preserve them, you must explicitly
+          copy them to the new SeqRecord yourself.
+
+        Using an integer index, e.g. my_record[5] is shorthand for
+        extracting that letter from the sequence, my_record.seq[5].
+
+        For example, consider this short protein and its secondary
+        structure as encoded by the PDB (e.g. H for alpha helices),
+        plus a simple feature for its histidine self phosphorylation
+        site:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
+        ...                     "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR"),
+        ...                 id="1JOY", name="EnvZ",
+        ...                 description="Homodimeric domain of EnvZ from E. coli")
+        >>> rec.letter_annotations["secondary_structure"] = "  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  "
+        >>> rec.features.append(SeqFeature(FeatureLocation(20, 21),
+        ...                     type = "Site"))
+
+        Now let's have a quick look at the full record,
+
+        >>> print(rec)
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')
+        >>> rec.letter_annotations["secondary_structure"]
+        '  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  '
+        >>> print(rec.features[0].location)
+        [20:21]
+
+        Now let's take a sub-sequence, here chosen as the first (fractured)
+        alpha helix which includes the histidine phosphorylation site:
+
+        >>> sub = rec[11:41]
+        >>> print(sub)
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD')
+        >>> sub.letter_annotations["secondary_structure"]
+        'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH'
+        >>> print(sub.features[0].location)
+        [9:10]
+
+        You can also of course omit the start or end values, for
+        example to get the first ten letters only:
+
+        >>> print(rec[:10])
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 0
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLAD')
+
+        Or for the last ten letters:
+
+        >>> print(rec[-10:])
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 0
+        Per letter annotation for: secondary_structure
+        Seq('IIEQFIDYLR')
+
+        If you omit both, then you get a copy of the original record (although
+        lacking the annotations and dbxrefs):
+
+        >>> print(rec[:])
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')
+
+        Finally, indexing with a simple integer is shorthand for pulling out
+        that letter from the sequence directly:
+
+        >>> rec[5]
+        'K'
+        >>> rec.seq[5]
+        'K'
+        """
+        if isinstance(index, int):
+            # NOTE - Sequence-level annotations such as the id, name, etc.
+            # do not really apply to a single character.  However, should
+            # we try to expose any per-letter-annotation here?  If so, how?
+            return self.seq[index]
+        elif isinstance(index, slice):
+            if self.seq is None:
+                raise ValueError("If the sequence is None, we cannot slice it.")
+            parent_length = len(self)
+            try:
+                from BioSQL.BioSeq import DBSeqRecord
+
+                biosql_available = True
+            except ImportError:
+                biosql_available = False
+
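+            # For a database-backed DBSeqRecord we return a plain in-memory
+            # SeqRecord built from the slice, since the DBSeqRecord subclass
+            # is not designed to be re-instantiated this way (editor's note;
+            # the general case below uses self.__class__ instead).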
+            if biosql_available and isinstance(self, DBSeqRecord):
+                answer = SeqRecord(
+                    self.seq[index],
+                    id=self.id,
+                    name=self.name,
+                    description=self.description,
+                )
+            else:
+                answer = self.__class__(
+                    self.seq[index],
+                    id=self.id,
+                    name=self.name,
+                    description=self.description,
+                )
+            # TODO - The description may no longer apply.
+            # It would be safer to change it to something
+            # generic like "edited" or the default value.
+
+            # Don't copy the annotations dict and dbxrefs list,
+            # they may not apply to a subsequence.
+            # answer.annotations = dict(self.annotations.items())
+            # answer.dbxrefs = self.dbxrefs[:]
+            # TODO - Review this in light of adding SeqRecord objects?
+
+            if "molecule_type" in self.annotations:
+                # This will still apply, and we need it for GenBank/EMBL etc output
+                answer.annotations["molecule_type"] = self.annotations["molecule_type"]
+
+            # TODO - Cope with strides by generating ambiguous locations?
+            start, stop, step = index.indices(parent_length)
+            if step == 1:
+                # Select relevant features, add them with shifted locations
+                # assert str(self.seq)[index] == str(self.seq)[start:stop]
+                for f in self.features:
+                    if f.ref or f.ref_db:
+                        # TODO - Implement this (with lots of tests)?
+                        import warnings
+
+                        warnings.warn(
+                            "When slicing SeqRecord objects, any "
+                            "SeqFeature referencing other sequences (e.g. "
+                            "from segmented GenBank records) are ignored."
+                        )
+                        continue
+                    if (
+                        start <= f.location.nofuzzy_start
+                        and f.location.nofuzzy_end <= stop
+                    ):
+                        answer.features.append(f._shift(-start))
+
+            # Slice all the values to match the sliced sequence
+            # (this should also work with strides, even negative strides):
+            for key, value in self.letter_annotations.items():
+                answer._per_letter_annotations[key] = value[index]
+
+            return answer
+        raise ValueError("Invalid index")
+
+    def __iter__(self):
+        """Iterate over the letters in the sequence.
+
+        For example, using Bio.SeqIO to read in a protein FASTA file:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta")
+        >>> for amino in record:
+        ...     print(amino)
+        ...     if amino == "L": break
+        X
+        A
+        G
+        L
+        >>> print(record.seq[3])
+        L
+
+        This is just a shortcut for iterating over the sequence directly:
+
+        >>> for amino in record.seq:
+        ...     print(amino)
+        ...     if amino == "L": break
+        X
+        A
+        G
+        L
+        >>> print(record.seq[3])
+        L
+
+        Note that this does not facilitate iteration together with any
+        per-letter-annotation.  However, you can achieve that using the
+        Python zip function on the record (or its sequence) and the relevant
+        per-letter-annotation:
+
+        >>> from Bio import SeqIO
+        >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
+        >>> print("%s %s" % (rec.id, rec.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(rec.letter_annotations))
+        ['solexa_quality']
+        >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]):
+        ...     if qual > 35:
+        ...         print("%s %i" % (nuc, qual))
+        A 40
+        C 39
+        G 38
+        T 37
+        A 36
+
+        You may agree that using zip(rec.seq, ...) is more explicit than using
+        zip(rec, ...) as shown above.
+        """
+        return iter(self.seq)
+
+    def __contains__(self, char):
+        """Implement the 'in' keyword, searches the sequence.
+
+        e.g.
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
+        >>> "GAATTC" in record
+        False
+        >>> "AAA" in record
+        True
+
+        This essentially acts as a proxy for using "in" on the sequence:
+
+        >>> "GAATTC" in record.seq
+        False
+        >>> "AAA" in record.seq
+        True
+
+        Note that you can also use Seq objects as the query,
+
+        >>> from Bio.Seq import Seq
+        >>> Seq("AAA") in record
+        True
+
+        See also the Seq object's __contains__ method.
+        """
+        return char in self.seq
+
+    def __str__(self):
+        """Return a human readable summary of the record and its annotation (string).
+
+        The Python built-in function str works by calling the object's __str__
+        method.  e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
+        ...                    id="YP_025292.1", name="HokC",
+        ...                    description="toxic membrane protein, small")
+        >>> print(str(record))
+        ID: YP_025292.1
+        Name: HokC
+        Description: toxic membrane protein, small
+        Number of features: 0
+        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
+
+        In this example you don't actually need to call str explicitly, as the
+        print function does this automatically:
+
+        >>> print(record)
+        ID: YP_025292.1
+        Name: HokC
+        Description: toxic membrane protein, small
+        Number of features: 0
+        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
+
+        Note that long sequences are shown truncated.
+        """
+        lines = []
+        if self.id:
+            lines.append(f"ID: {self.id}")
+        if self.name:
+            lines.append(f"Name: {self.name}")
+        if self.description:
+            lines.append(f"Description: {self.description}")
+        if self.dbxrefs:
+            lines.append("Database cross-references: " + ", ".join(self.dbxrefs))
+        lines.append(f"Number of features: {len(self.features)}")
+        for a in self.annotations:
+            lines.append(f"/{a}={str(self.annotations[a])}")
+        if self.letter_annotations:
+            lines.append(
+                "Per letter annotation for: " + ", ".join(self.letter_annotations)
+            )
+        try:
+            bytes(self.seq)
+        except UndefinedSequenceError:
+            lines.append(f"Undefined sequence of length {len(self.seq)}")
+        else:
+            # Don't want to include the entire sequence
+            seq = repr(self.seq)
+            lines.append(seq)
+        return "\n".join(lines)
+
+    def __repr__(self):
+        """Return a concise summary of the record for debugging (string).
+
+        The Python built-in function repr works by calling the object's __repr__
+        method.  e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
+        ...                     "GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
+        ...                     "SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
+        ...                     "PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF"),
+        ...                 id="NP_418483.1", name="b4059",
+        ...                 description="ssDNA-binding protein",
+        ...                 dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
+        >>> print(repr(rec))
+        SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+        At the python prompt you can also use this shorthand:
+
+        >>> rec
+        SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+        Note that long sequences are shown truncated. Also note that any
+        annotations, letter_annotations and features are not shown (as they
+        would lead to a very long string).
+        """
+        return (
+            f"{self.__class__.__name__}(seq={self.seq!r}, id={self.id!r},"
+            f" name={self.name!r}, description={self.description!r},"
+            f" dbxrefs={self.dbxrefs!r})"
+        )
+
+    def format(self, format):
+        r"""Return the record as a string in the specified file format.
+
+        The format should be a lower case string supported as an output
+        format by Bio.SeqIO, which is used to turn the SeqRecord into a
+        string.  e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
+        ...                    id="YP_025292.1", name="HokC",
+        ...                    description="toxic membrane protein")
+        >>> record.format("fasta")
+        '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
+        >>> print(record.format("fasta"))
+        >YP_025292.1 toxic membrane protein
+        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+        <BLANKLINE>
+
+        The Python print function automatically appends a new line, meaning
+        in this example a blank line is shown.  If you look at the string
+        representation you can see there is a trailing new line (shown as
+        slash n) which is important when writing to a file or if
+        concatenating multiple sequence strings together.
+
+        Note that this method will NOT work on every possible file format
+        supported by Bio.SeqIO (e.g. some are for multiple sequences only,
+        and binary formats are not supported).
+        """
+        # See also the __format__ method
+        # See also the Bio.Align.Generic.Alignment class and its format()
+        return self.__format__(format)
+
+    def __format__(self, format_spec):
+        r"""Return the record as a string in the specified file format.
+
+        This method supports the Python format() function and f-strings.
+        The format_spec should be a lower case string supported by
+        Bio.SeqIO as a text output file format. Requesting a binary file
+        format raises a ValueError. e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
+        ...                    id="YP_025292.1", name="HokC",
+        ...                    description="toxic membrane protein")
+        ...
+        >>> format(record, "fasta")
+        '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
+        >>> print(f"Here is {record.id} in FASTA format:\n{record:fasta}")
+        Here is YP_025292.1 in FASTA format:
+        >YP_025292.1 toxic membrane protein
+        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+        <BLANKLINE>
+
+        See also the SeqRecord's format() method.
+        """
+        if not format_spec:
+            # Follow python convention and default to using __str__
+            return str(self)
+        from Bio import SeqIO
+
+        # Easy case, can call string-building function directly
+        if format_spec in SeqIO._FormatToString:
+            return SeqIO._FormatToString[format_spec](self)
+
+        # Harder case, make a temp handle instead
+        handle = StringIO()
+        try:
+            SeqIO.write(self, handle, format_spec)
+        except StreamModeError:
+            raise ValueError(
+                "Binary format %s cannot be used with SeqRecord format method"
+                % format_spec
+            ) from None
+        return handle.getvalue()
+
+    def __len__(self):
+        """Return the length of the sequence.
+
+        For example, using Bio.SeqIO to read in a FASTA nucleotide file:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
+        >>> len(record)
+        309
+        >>> len(record.seq)
+        309
+        """
+        return len(self.seq)
+
+    def __lt__(self, other):
+        """Define the less-than operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __le__(self, other):
+        """Define the less-than-or-equal-to operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __eq__(self, other):
+        """Define the equal-to operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __ne__(self, other):
+        """Define the not-equal-to operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __gt__(self, other):
+        """Define the greater-than operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __ge__(self, other):
+        """Define the greater-than-or-equal-to operator (not implemented)."""
+        raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
+
+    def __bool__(self):
+        """Boolean value of an instance of this class (True).
+
+        This behaviour is for backwards compatibility, since until the
+        __len__ method was added, a SeqRecord always evaluated as True.
+
+        Note that in comparison, a Seq object will evaluate to False if it
+        has a zero length sequence.
+
+        WARNING: The SeqRecord may in future evaluate to False when its
+        sequence is of zero length (in order to better match the Seq
+        object behaviour)!
+        """
+        return True
+
+    def __add__(self, other):
+        """Add another sequence or string to this sequence.
+
+        The other sequence can be a SeqRecord object, a Seq object (or
+        similar, e.g. a MutableSeq) or a plain Python string. If you add
+        a plain string or a Seq-like object, the new SeqRecord will simply
+        have this appended to the existing data. However, any per-letter
+        annotation will be lost:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
+        >>> print("%s %s" % (record.id, record.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(record.letter_annotations))
+        ['solexa_quality']
+
+        >>> new = record + "ACT"
+        >>> print("%s %s" % (new.id, new.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
+        >>> print(list(new.letter_annotations))
+        []
+
+        The new record will attempt to combine the annotation, but for any
+        ambiguities (e.g. different names) it defaults to omitting that
+        annotation.
+
+        >>> from Bio import SeqIO
+        >>> with open("GenBank/pBAD30.gb") as handle:
+        ...     plasmid = SeqIO.read(handle, "gb")
+        >>> print("%s %i" % (plasmid.id, len(plasmid)))
+        pBAD30 4923
+
+        Now let's cut the plasmid into two pieces, and join them back up the
+        other way round (i.e. shift the starting point on this plasmid; have
+        a look at the annotated features in the original file to see why this
+        particular split point might make sense):
+
+        >>> left = plasmid[:3765]
+        >>> right = plasmid[3765:]
+        >>> new = right + left
+        >>> print("%s %i" % (new.id, len(new)))
+        pBAD30 4923
+        >>> str(new.seq) == str(right.seq + left.seq)
+        True
+        >>> len(new.features) == len(left.features) + len(right.features)
+        True
+
+        When we add the left and right SeqRecord objects, their annotation
+        is all consistent, so it is all conserved in the new SeqRecord:
+
+        >>> new.id == left.id == right.id == plasmid.id
+        True
+        >>> new.name == left.name == right.name == plasmid.name
+        True
+        >>> new.description == plasmid.description
+        True
+        >>> new.annotations == left.annotations == right.annotations
+        True
+        >>> new.letter_annotations == plasmid.letter_annotations
+        True
+        >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
+        True
+
+        However, we should point out that when we sliced the SeqRecord,
+        any annotations dictionary or dbxrefs list entries were lost.
+        You can explicitly copy them like this:
+
+        >>> new.annotations = plasmid.annotations.copy()
+        >>> new.dbxrefs = plasmid.dbxrefs[:]
+        """
+        if not isinstance(other, SeqRecord):
+            # Assume it is a string or a Seq.
+            # Note can't transfer any per-letter-annotations
+            return SeqRecord(
+                self.seq + other,
+                id=self.id,
+                name=self.name,
+                description=self.description,
+                features=self.features[:],
+                annotations=self.annotations.copy(),
+                dbxrefs=self.dbxrefs[:],
+            )
+        # Adding two SeqRecord objects... must merge annotation.
+        answer = SeqRecord(
+            self.seq + other.seq, features=self.features[:], dbxrefs=self.dbxrefs[:]
+        )
+        # Will take all the features and all the db cross refs,
+        length = len(self)
+        for f in other.features:
+            answer.features.append(f._shift(length))
+        del length
+        for ref in other.dbxrefs:
+            if ref not in answer.dbxrefs:
+                answer.dbxrefs.append(ref)
+        # Take common id/name/description/annotation
+        if self.id == other.id:
+            answer.id = self.id
+        if self.name == other.name:
+            answer.name = self.name
+        if self.description == other.description:
+            answer.description = self.description
+        for k, v in self.annotations.items():
+            if k in other.annotations and other.annotations[k] == v:
+                answer.annotations[k] = v
+        # Can append matching per-letter-annotation
+        for k, v in self.letter_annotations.items():
+            if k in other.letter_annotations:
+                answer.letter_annotations[k] = v + other.letter_annotations[k]
+        return answer
+
+    def __radd__(self, other):
+        """Add another sequence or string to this sequence (from the left).
+
+        This method handles adding a Seq object (or similar, e.g. MutableSeq)
+        or a plain Python string (on the left) to a SeqRecord (on the right).
+        See the __add__ method for more details, but for example:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
+        >>> print("%s %s" % (record.id, record.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(record.letter_annotations))
+        ['solexa_quality']
+
+        >>> new = "ACT" + record
+        >>> print("%s %s" % (new.id, new.seq))
+        slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(new.letter_annotations))
+        []
+        """
+        if isinstance(other, SeqRecord):
+            raise RuntimeError(
+                "This should have happened via the __add__ of "
+                "the other SeqRecord being added!"
+            )
+        # Assume it is a string or a Seq.
+        # Note can't transfer any per-letter-annotations
+        offset = len(other)
+        return SeqRecord(
+            other + self.seq,
+            id=self.id,
+            name=self.name,
+            description=self.description,
+            features=[f._shift(offset) for f in self.features],
+            annotations=self.annotations.copy(),
+            dbxrefs=self.dbxrefs[:],
+        )
+
+    def upper(self):
+        """Return a copy of the record with an upper case sequence.
+
+        All the annotation is preserved unchanged. e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> record = SeqRecord(Seq("acgtACGT"), id="Test",
+        ...                    description = "Made up for this example")
+        >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8]
+        >>> print(record.upper().format("fastq"))
+        @Test Made up for this example
+        ACGTACGT
+        +
+        "#$%&'()
+        <BLANKLINE>
+
+        Naturally, there is a matching lower method:
+
+        >>> print(record.lower().format("fastq"))
+        @Test Made up for this example
+        acgtacgt
+        +
+        "#$%&'()
+        <BLANKLINE>
+        """
+        return SeqRecord(
+            self.seq.upper(),
+            id=self.id,
+            name=self.name,
+            description=self.description,
+            dbxrefs=self.dbxrefs[:],
+            features=self.features[:],
+            annotations=self.annotations.copy(),
+            letter_annotations=self.letter_annotations.copy(),
+        )
+
+    def lower(self):
+        """Return a copy of the record with a lower case sequence.
+
+        All the annotation is preserved unchanged. e.g.
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Fasta/aster.pro", "fasta")
+        >>> print(record.format("fasta"))
+        >gi|3298468|dbj|BAA31520.1| SAMIPF
+        GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
+        VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
+        <BLANKLINE>
+        >>> print(record.lower().format("fasta"))
+        >gi|3298468|dbj|BAA31520.1| SAMIPF
+        gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
+        vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
+        <BLANKLINE>
+
+        To take a more annotation-rich example,
+
+        >>> from Bio import SeqIO
+        >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
+        >>> len(old.features)
+        3
+        >>> new = old.lower()
+        >>> len(old.features) == len(new.features)
+        True
+        >>> old.annotations["organism"] == new.annotations["organism"]
+        True
+        >>> old.dbxrefs == new.dbxrefs
+        True
+        """
+        return SeqRecord(
+            self.seq.lower(),
+            id=self.id,
+            name=self.name,
+            description=self.description,
+            dbxrefs=self.dbxrefs[:],
+            features=self.features[:],
+            annotations=self.annotations.copy(),
+            letter_annotations=self.letter_annotations.copy(),
+        )
+
+    def reverse_complement(
+        self,
+        id=False,
+        name=False,
+        description=False,
+        features=True,
+        annotations=False,
+        letter_annotations=True,
+        dbxrefs=False,
+    ):
+        """Return new SeqRecord with reverse complement sequence.
+
+        By default the new record does NOT preserve the sequence identifier,
+        name, description, general annotation or database cross-references -
+        these are unlikely to apply to the reversed sequence.
+
+        You can specify the returned record's id, name and description as
+        strings, or True to keep that of the parent, or False for a default.
+
+        You can specify the returned record's features with a list of
+        SeqFeature objects, or True to keep that of the parent, or False to
+        omit them. The default is to keep the original features (with the
+        strand and locations adjusted).
+
+        You can also specify both the returned record's annotations and
+        letter_annotations as dictionaries, True to keep that of the parent,
+        or False to omit them. The default is to keep the original
+        annotations (with the letter annotations reversed).
+
+        To show what happens to the per-letter annotations, consider an
+        example Solexa variant FASTQ file with a single entry, which we'll
+        read in as a SeqRecord:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
+        >>> print("%s %s" % (record.id, record.seq))
+        slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
+        >>> print(list(record.letter_annotations))
+        ['solexa_quality']
+        >>> print(record.letter_annotations["solexa_quality"])
+        [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
+
+        Now take the reverse complement; here we explicitly give a new
+        identifier (the old identifier with a suffix):
+
+        >>> rc_record = record.reverse_complement(id=record.id + "_rc")
+        >>> print("%s %s" % (rc_record.id, rc_record.seq))
+        slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+
+        Notice that the per-letter-annotations have also been reversed,
+        although this may not be appropriate for all cases.
+
+        >>> print(rc_record.letter_annotations["solexa_quality"])
+        [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
+
+        Now for the features, we need a different example. Parsing a GenBank
+        file is probably the easiest way to get a nice example with features
+        in it...
+
+        >>> from Bio import SeqIO
+        >>> with open("GenBank/pBAD30.gb") as handle:
+        ...     plasmid = SeqIO.read(handle, "gb")
+        >>> print("%s %i" % (plasmid.id, len(plasmid)))
+        pBAD30 4923
+        >>> plasmid.seq
+        Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG')
+        >>> len(plasmid.features)
+        13
+
+        Now, let's take the reverse complement of this whole plasmid:
+
+        >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc")
+        >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid)))
+        pBAD30_rc 4923
+        >>> rc_plasmid.seq
+        Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC')
+        >>> len(rc_plasmid.features)
+        13
+
+        Let's compare the first CDS feature - it has gone from being the
+        second feature (index 1) to the second last feature (index -2), its
+        strand has changed, and the location switched round.
+
+        >>> print(plasmid.features[1])
+        type: CDS
+        location: [1081:1960](-)
+        qualifiers:
+            Key: label, Value: ['araC']
+            Key: note, Value: ['araC regulator of the arabinose BAD promoter']
+            Key: vntifkey, Value: ['4']
+        <BLANKLINE>
+        >>> print(rc_plasmid.features[-2])
+        type: CDS
+        location: [2963:3842](+)
+        qualifiers:
+            Key: label, Value: ['araC']
+            Key: note, Value: ['araC regulator of the arabinose BAD promoter']
+            Key: vntifkey, Value: ['4']
+        <BLANKLINE>
+
+        You can check this new location, based on the length of the plasmid:
+
+        >>> len(plasmid) - 1081
+        3842
+        >>> len(plasmid) - 1960
+        2963
+
+        Note that if the SeqFeature annotation includes any strand specific
+        information (e.g. base changes for a SNP), this information is not
+        amended, and would need correction after the reverse complement.
+
+        Note that trying to reverse complement a protein SeqRecord raises an
+        exception:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> protein_rec = SeqRecord(Seq("MAIVMGR"), id="Test",
+        ...                         annotations={"molecule_type": "protein"})
+        >>> protein_rec.reverse_complement()
+        Traceback (most recent call last):
+           ...
+        ValueError: Proteins do not have complements!
+
+        If you have RNA without any U bases, it must be annotated as RNA;
+        otherwise it will be treated as DNA by default, with A mapped to T:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> rna1 = SeqRecord(Seq("ACG"), id="Test")
+        >>> rna2 = SeqRecord(Seq("ACG"), id="Test", annotations={"molecule_type": "RNA"})
+        >>> print(rna1.reverse_complement(id="RC", description="unk").format("fasta"))
+        >RC unk
+        CGT
+        <BLANKLINE>
+        >>> print(rna2.reverse_complement(id="RC", description="RNA").format("fasta"))
+        >RC RNA
+        CGU
+        <BLANKLINE>
+
+        Also note you can reverse complement a SeqRecord using a MutableSeq:
+
+        >>> from Bio.Seq import MutableSeq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> rec = SeqRecord(MutableSeq("ACGT"), id="Test")
+        >>> rec.seq[0] = "T"
+        >>> print("%s %s" % (rec.id, rec.seq))
+        Test TCGT
+        >>> rc = rec.reverse_complement(id=True)
+        >>> print("%s %s" % (rc.id, rc.seq))
+        Test ACGA
+        """
+        from Bio.Seq import Seq, MutableSeq  # Lazy to avoid circular imports
+
+        if "protein" in self.annotations.get("molecule_type", ""):
+            raise ValueError("Proteins do not have complements!")
+        if "RNA" in self.annotations.get("molecule_type", ""):
+            if isinstance(self.seq, MutableSeq):
+                # Does not currently have reverse_complement_rna method:
+                answer = SeqRecord(Seq(self.seq).reverse_complement_rna())
+            else:
+                answer = SeqRecord(self.seq.reverse_complement_rna())
+        else:
+            # Default to DNA
+            if isinstance(self.seq, MutableSeq):
+                # Currently the MutableSeq reverse complement is in situ
+                answer = SeqRecord(Seq(self.seq).reverse_complement())
+            else:
+                answer = SeqRecord(self.seq.reverse_complement())
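+        # The annotation arguments below all follow the convention described
+        # in the docstring: an explicit value (string, list or dict) is used
+        # as-is, True copies the parent's value, and False leaves the default.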
+        if isinstance(id, str):
+            answer.id = id
+        elif id:
+            answer.id = self.id
+        if isinstance(name, str):
+            answer.name = name
+        elif name:
+            answer.name = self.name
+        if isinstance(description, str):
+            answer.description = description
+        elif description:
+            answer.description = self.description
+        if isinstance(dbxrefs, list):
+            answer.dbxrefs = dbxrefs
+        elif dbxrefs:
+            # Copy the old dbxrefs
+            answer.dbxrefs = self.dbxrefs[:]
+        if isinstance(features, list):
+            answer.features = features
+        elif features:
+            # Copy the old features, adjusting location and strand
+            length = len(answer)
+            answer.features = [f._flip(length) for f in self.features]
+            # The old list should have been sorted by start location,
+            # reversing it will leave it sorted by what is now the end position,
+            # so we need to resort in case of overlapping features.
+            # NOTE - In the common case of gene before CDS (and similar) with
+            # the exact same locations, this will still maintain gene before CDS
+            answer.features.sort(key=lambda x: x.location.start.position)
+        if isinstance(annotations, dict):
+            answer.annotations = annotations
+        elif annotations:
+            # Copy the old annotations,
+            answer.annotations = self.annotations.copy()
+        if isinstance(letter_annotations, dict):
+            answer.letter_annotations = letter_annotations
+        elif letter_annotations:
+            # Copy the old per letter annotations, reversing them
+            for key, value in self.letter_annotations.items():
+                answer._per_letter_annotations[key] = value[::-1]
+        return answer
+
+    def translate(
+        self,
+        # Seq translation arguments:
+        table="Standard",
+        stop_symbol="*",
+        to_stop=False,
+        cds=False,
+        gap=None,
+        # SeqRecord annotation arguments:
+        id=False,
+        name=False,
+        description=False,
+        features=False,
+        annotations=False,
+        letter_annotations=False,
+        dbxrefs=False,
+    ):
+        """Return new SeqRecord with translated sequence.
+
+        This calls the record's .seq.translate() method (which describes
+        the translation-related arguments, like table for the genetic code).
+
+        By default the new record does NOT preserve the sequence identifier,
+        name, description, general annotation or database cross-references -
+        these are unlikely to apply to the translated sequence.
+
+        You can specify the returned record's id, name and description as
+        strings, or True to keep that of the parent, or False for a default.
+
+        You can specify the returned record's features with a list of
+        SeqFeature objects, or False (default) to omit them.
+
+        You can also specify both the returned record's annotations and
+        letter_annotations as dictionaries, True to keep that of the parent
+        (annotations only), or False (default) to omit them.
+
+        e.g. Loading a FASTA gene and translating it,
+
+        >>> from Bio import SeqIO
+        >>> gene_record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
+        >>> print(gene_record.format("fasta"))
+        >gi|3176602|gb|U78617.1|LOU78617 Lathyrus odoratus phytochrome A (PHYA) gene, partial cds
+        CAGGCTGCGCGGTTTCTATTTATGAAGAACAAGGTCCGTATGATAGTTGATTGTCATGCA
+        AAACATGTGAAGGTTCTTCAAGACGAAAAACTCCCATTTGATTTGACTCTGTGCGGTTCG
+        ACCTTAAGAGCTCCACATAGTTGCCATTTGCAGTACATGGCTAACATGGATTCAATTGCT
+        TCATTGGTTATGGCAGTGGTCGTCAATGACAGCGATGAAGATGGAGATAGCCGTGACGCA
+        GTTCTACCACAAAAGAAAAAGAGACTTTGGGGTTTGGTAGTTTGTCATAACACTACTCCG
+        AGGTTTGTT
+        <BLANKLINE>
+
+        And now translating the record, specifying the new ID and description:
+
+        >>> protein_record = gene_record.translate(table=11,
+        ...                                        id="phya",
+        ...                                        description="translation")
+        >>> print(protein_record.format("fasta"))
+        >phya translation
+        QAARFLFMKNKVRMIVDCHAKHVKVLQDEKLPFDLTLCGSTLRAPHSCHLQYMANMDSIA
+        SLVMAVVVNDSDEDGDSRDAVLPQKKKRLWGLVVCHNTTPRFV
+        <BLANKLINE>
+
+        """
+        if "protein" == self.annotations.get("molecule_type", ""):
+            raise ValueError("Proteins cannot be translated!")
+        answer = SeqRecord(
+            self.seq.translate(
+                table=table, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap
+            )
+        )
+        if isinstance(id, str):
+            answer.id = id
+        elif id:
+            answer.id = self.id
+        if isinstance(name, str):
+            answer.name = name
+        elif name:
+            answer.name = self.name
+        if isinstance(description, str):
+            answer.description = description
+        elif description:
+            answer.description = self.description
+        if isinstance(dbxrefs, list):
+            answer.dbxrefs = dbxrefs
+        elif dbxrefs:
+            # Copy the old dbxrefs
+            answer.dbxrefs = self.dbxrefs[:]
+        if isinstance(features, list):
+            answer.features = features
+        elif features:
+            # Does not make sense to copy the old features as their locations would be wrong
+            raise TypeError("Unexpected features argument %r" % features)
+        if isinstance(annotations, dict):
+            answer.annotations = annotations
+        elif annotations:
+            # Copy the old annotations
+            answer.annotations = self.annotations.copy()
+        # Set/update to protein:
+        answer.annotations["molecule_type"] = "protein"
+        if isinstance(letter_annotations, dict):
+            answer.letter_annotations = letter_annotations
+        elif letter_annotations:
+            # Does not make sense to copy these as the length is now wrong
+            raise TypeError(
+                "Unexpected letter_annotations argument %r" % letter_annotations
+            )
+        return answer
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqUtils/CheckSum.py b/code/lib/Bio/SeqUtils/CheckSum.py
new file mode 100644
index 0000000..73f3e72
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/CheckSum.py
@@ -0,0 +1,145 @@
+# Copyright 2002 by Yves Bastide and Brad Chapman.
+# Copyright 2007 by Sebastian Bassi
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Functions to calculate assorted sequence checksums."""
+
+# crc32, crc64, gcg, and seguid
+# crc64 is adapted from BioPerl
+
+
+import binascii
+
+
+def crc32(seq):
+    """Return the crc32 checksum for a sequence (string or Seq object).
+
+    Note that the case is important:
+
+    >>> crc32("ACGTACGTACGT")
+    20049947
+    >>> crc32("acgtACGTacgt")
+    1688586483
+
+    """
+    try:
+        # Assume it's a Seq object
+        s = bytes(seq)
+    except TypeError:
+        # Assume it's a string
+        s = seq.encode()
+    return binascii.crc32(s)
+
+
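+# _init_table_h precomputes the 256-entry lookup table for the high 32 bits
+# of a CRC-64 implemented as two 32-bit halves. 0xD8000000 is the high word
+# of the bit-reversed generator polynomial used by BioPerl's SWISS-PROT
+# style CRC-64, from which this code is adapted; because that polynomial has
+# no bits in the low word, the corresponding low-half table would be all
+# zeros and is simply omitted (editor's note).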
+def _init_table_h():
+    _table_h = []
+    for i in range(256):
+        part_l = i
+        part_h = 0
+        for j in range(8):
+            rflag = part_l & 1
+            part_l >>= 1
+            if part_h & 1:
+                part_l |= 1 << 31
+            part_h >>= 1
+            if rflag:
+                part_h ^= 0xD8000000
+        _table_h.append(part_h)
+    return _table_h
+
+
+# Initialisation
+_table_h = _init_table_h()
+
+
+def crc64(s):
+    """Return the crc64 checksum for a sequence (string or Seq object).
+
+    Note that the case is important:
+
+    >>> crc64("ACGTACGTACGT")
+    'CRC-C4FBB762C4A87EBD'
+    >>> crc64("acgtACGTacgt")
+    'CRC-DA4509DC64A87EBD'
+
+    """
+    crcl = 0
+    crch = 0
+    for c in s:
+        shr = (crch & 0xFF) << 24
+        temp1h = crch >> 8
+        temp1l = (crcl >> 8) | shr
+        idx = (crcl ^ ord(c)) & 0xFF
+        crch = temp1h ^ _table_h[idx]
+        crcl = temp1l
+
+    return "CRC-%08X%08X" % (crch, crcl)
+
+
+def gcg(seq):
+    """Return the GCG checksum (int) for a sequence (string or Seq object).
+
+    Given a nucleotide or amino-acid sequence (or any string),
+    returns the GCG checksum (int), as used by the GCG program suite.
+
+    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
+    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
+
+    All sequences are converted to uppercase.
+
+    >>> gcg("ACGTACGTACGT")
+    5688
+    >>> gcg("acgtACGTacgt")
+    5688
+
+    """
+    index = checksum = 0
+    for char in seq:
+        index += 1
+        checksum += index * ord(char.upper())
+        if index == 57:
+            index = 0
+    return checksum % 10000
+
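+# Worked example (editor's sketch): for the sequence "AC" the running sum is
+# 1*ord("A") + 2*ord("C") = 65 + 134 = 199, so gcg("AC") == 199 % 10000 == 199.
+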
+
+def seguid(seq):
+    """Return the SEGUID (string) for a sequence (string or Seq object).
+
+    Given a nucleotide or amino-acid sequence (or any string),
+    returns the SEGUID string (a SEquence Globally Unique IDentifier).
+
+    Note that the case is not important:
+
+    >>> seguid("ACGTACGTACGT")
+    'If6HIvcnRSQDVNiAoefAzySc6i4'
+    >>> seguid("acgtACGTacgt")
+    'If6HIvcnRSQDVNiAoefAzySc6i4'
+
+    For more information about SEGUID, see:
+    http://bioinformatics.anl.gov/seguid/
+    https://doi.org/10.1002/pmic.200600032
+    """
+    import hashlib
+    import base64
+
+    m = hashlib.sha1()
+    try:
+        # Assume it's a Seq object
+        seq = bytes(seq)
+    except TypeError:
+        # Assume it's a string
+        seq = seq.encode()
+    m.update(seq.upper())
+    tmp = base64.encodebytes(m.digest())
+    return tmp.decode().replace("\n", "").rstrip("=")
+
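+# Conceptually, seguid() is the base64-encoded SHA-1 digest of the upper-case
+# sequence with the trailing "=" padding removed, i.e. roughly (editor's
+# sketch of the code above):
+#   base64.b64encode(hashlib.sha1(b"ACGTACGTACGT").digest()).decode().rstrip("=")
+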
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqUtils/CodonUsage.py b/code/lib/Bio/SeqUtils/CodonUsage.py
new file mode 100644
index 0000000..c4e95e0
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/CodonUsage.py
@@ -0,0 +1,187 @@
+# Copyright 2003 Yair Benita.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Methods for codon usage calculations."""
+
+import math
+from .CodonUsageIndices import SharpEcoliIndex
+from Bio import SeqIO  # To parse a FASTA file
+
+# Turn black code style off
+# fmt: off
+
+CodonsDict = {
+    "TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0,
+    "CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0,
+    "ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0,
+    "GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0,
+    "TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0,
+    "CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0,
+    "AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0,
+    "GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0,
+    "TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0,
+    "CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0,
+    "ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0,
+    "GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0,
+    "TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0,
+    "CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0,
+    "AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0,
+    "GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0}
+
+# Turn black code style on
+# fmt: on
+
+
+# this dictionary shows which codons encode the same AA
+SynonymousCodons = {
+    "CYS": ["TGT", "TGC"],
+    "ASP": ["GAT", "GAC"],
+    "SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"],
+    "GLN": ["CAA", "CAG"],
+    "MET": ["ATG"],
+    "ASN": ["AAC", "AAT"],
+    "PRO": ["CCT", "CCG", "CCA", "CCC"],
+    "LYS": ["AAG", "AAA"],
+    "STOP": ["TAG", "TGA", "TAA"],
+    "THR": ["ACC", "ACA", "ACG", "ACT"],
+    "PHE": ["TTT", "TTC"],
+    "ALA": ["GCA", "GCC", "GCG", "GCT"],
+    "GLY": ["GGT", "GGG", "GGA", "GGC"],
+    "ILE": ["ATC", "ATA", "ATT"],
+    "LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"],
+    "HIS": ["CAT", "CAC"],
+    "ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"],
+    "TRP": ["TGG"],
+    "VAL": ["GTA", "GTC", "GTG", "GTT"],
+    "GLU": ["GAG", "GAA"],
+    "TYR": ["TAT", "TAC"],
+}
+
+
+class CodonAdaptationIndex:
+    """A codon adaptation index (CAI) implementation.
+
+    Implements the codon adaptation index (CAI) described by Sharp and
+    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
+
+    NOTE - This implementation does not currently cope with alternative genetic
+    codes: only the synonymous codons in the standard table are considered.
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.index = {}
+        self.codon_count = {}
+
+    # Use this method with a predefined CAI index.
+    def set_cai_index(self, index):
+        """Set up an index to be used when calculating CAI for a gene.
+
+        Just pass a dictionary similar to the SharpEcoliIndex in the
+        CodonUsageIndices module.
+        """
+        self.index = index
+
+    def generate_index(self, fasta_file):
+        """Generate a codon usage index from a FASTA file of CDS sequences.
+
+        Takes the location of a FASTA file containing CDS sequences
+        (which must all have a whole number of codons) and generates a
+        codon usage index based on RSCU values (Relative Synonymous
+        Codon Usage).
+        """
+        # first make sure we're not overwriting an existing index:
+        if self.index != {} or self.codon_count != {}:
+            raise ValueError(
+                "an index has already been set or a codon count "
+                "has been done. Cannot overwrite either."
+            )
+
+        # count codon occurrences in the file.
+        self._count_codons(fasta_file)
+
+        # now to calculate the index we first need to sum the number of times
+        # synonymous codons were used all together.
+        for aa in SynonymousCodons:
+            total = 0.0
+            # RSCU values are CodonCount / ((1 / num of synonymous codons) *
+            # sum of all synonymous codons)
+            rcsu = []
+            codons = SynonymousCodons[aa]
+
+            for codon in codons:
+                total += self.codon_count[codon]
+
+            # calculate the RSCU value for each of the codons
+            for codon in codons:
+                denominator = float(total) / len(codons)
+                rcsu.append(self.codon_count[codon] / denominator)
+
+            # now generate the index W = RSCU_i / RSCU_max:
+            rcsu_max = max(rcsu)
+            for codon_index, codon in enumerate(codons):
+                self.index[codon] = rcsu[codon_index] / rcsu_max
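+        # Worked example (editor's sketch): for a two-codon amino acid counted
+        # 30 and 10 times, total = 40.0 and the denominator is 40.0 / 2 = 20.0,
+        # so the RSCU values are [1.5, 0.5]; with rcsu_max = 1.5 the final
+        # index values W are [1.0, 0.333...].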
+
+    def cai_for_gene(self, dna_sequence):
+        """Calculate the CAI (float) for the provided DNA sequence (string).
+
+        This method uses the Index (either the one you set or the one you
+        generated) and returns the CAI for the DNA sequence.
+        """
+        cai_value, cai_length = 0, 0
+
+        # if no index is set or generated, the default SharpEcoliIndex will
+        # be used.
+        if self.index == {}:
+            self.set_cai_index(SharpEcoliIndex)
+
+        if dna_sequence.islower():
+            dna_sequence = dna_sequence.upper()
+
+        for i in range(0, len(dna_sequence), 3):
+            codon = dna_sequence[i : i + 3]
+            if codon in self.index:
+                # ATG and TGG always have an index value of 1, exclude them:
+                if codon not in ["ATG", "TGG"]:
+                    cai_value += math.log(self.index[codon])
+                    cai_length += 1
+            # some indices may not include stop codons:
+            elif codon not in ["TGA", "TAA", "TAG"]:
+                raise TypeError(
+                    "illegal codon in sequence: %s.\n%s" % (codon, self.index)
+                )
+
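+        # CAI is essentially the geometric mean of the index values w of the
+        # counted codons, computed as exp(mean of ln(w)); note that this
+        # implementation divides the log sum by (cai_length - 1.0).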
+        return math.exp(cai_value / (cai_length - 1.0))
+
+    def _count_codons(self, fasta_file):
+        with open(fasta_file) as handle:
+
+            # make the codon dictionary local
+            self.codon_count = CodonsDict.copy()
+
+            # iterate over the sequences and count all the codons in the FASTA file.
+            for cur_record in SeqIO.parse(handle, "fasta"):
+                # make sure the sequence is upper case
+                if str(cur_record.seq).islower():
+                    dna_sequence = str(cur_record.seq).upper()
+                else:
+                    dna_sequence = str(cur_record.seq)
+                for i in range(0, len(dna_sequence), 3):
+                    codon = dna_sequence[i : i + 3]
+                    if codon in self.codon_count:
+                        self.codon_count[codon] += 1
+                    else:
+                        raise TypeError(
+                            "illegal codon %s in gene: %s" % (codon, cur_record.id)
+                        )
+
+    def print_index(self):
+        """Print out the index used.
+
+        This just prints each codon and its index value, tab separated.
+        """
+        for i in sorted(self.index):
+            print("%s\t%.3f" % (i, self.index[i]))
diff --git a/code/lib/Bio/SeqUtils/CodonUsageIndices.py b/code/lib/Bio/SeqUtils/CodonUsageIndices.py
new file mode 100644
index 0000000..99aa097
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/CodonUsageIndices.py
@@ -0,0 +1,28 @@
+# Copyright 2003 Yair Benita.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Codon adaption indxes, including Sharp and Li (1987) E. coli index.
+
+Currently this module only defines a single codon adaption index from
+Sharp & Li, Nucleic Acids Res. 1987.
+"""
+# Turn black code style off
+# fmt: off
+
+SharpEcoliIndex = {
+    "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1, "AGA": 0.004,
+    "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1, "AAC": 1,
+    "AAT": 0.051, "GAC": 1, "GAT": 0.434, "TGC": 1, "TGT": 0.5, "CAA": 0.124,
+    "CAG": 1, "GAA": 1, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019,
+    "GGT": 1, "CAC": 1, "CAT": 0.291, "ATA": 0.003, "ATC": 1, "ATT": 0.185,
+    "CTA": 0.007, "CTC": 0.037, "CTG": 1, "CTT": 0.042, "TTA": 0.02,
+    "TTG": 0.02, "AAA": 1, "AAG": 0.253, "ATG": 1, "TTC": 1, "TTT": 0.296,
+    "CCA": 0.135, "CCC": 0.012, "CCG": 1, "CCT": 0.07, "AGC": 0.41,
+    "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1,
+    "ACA": 0.076, "ACC": 1, "ACG": 0.099, "ACT": 0.965, "TGG": 1, "TAC": 1,
+    "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1}
+
+# Turn black code style on
+# fmt: on
diff --git a/code/lib/Bio/SeqUtils/IsoelectricPoint.py b/code/lib/Bio/SeqUtils/IsoelectricPoint.py
new file mode 100644
index 0000000..f4090f8
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/IsoelectricPoint.py
@@ -0,0 +1,161 @@
+# Copyright 2003 Yair Benita.  All rights reserved.
+# Revisions copyright 2020 by Tianyi Shi.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Calculate isoelectric points of polypeptides using methods of Bjellqvist.
+
+pK values and the methods are taken from::
+
+    * Bjellqvist, B.,Hughes, G.J., Pasquali, Ch., Paquet, N., Ravier, F.,
+    Sanchez, J.-Ch., Frutiger, S. & Hochstrasser, D.F.
+    The focusing positions of polypeptides in immobilized pH gradients can be
+    predicted from their amino acid sequences. Electrophoresis 1993, 14,
+    1023-1031.
+
+    * Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+    Reference points for comparisons of two-dimensional maps of proteins from
+    different human cell types defined in a pH scale where isoelectric points
+    correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+
+I designed the algorithm according to a note by David L. Tabb, available at:
+http://fields.scripps.edu/DTASelect/20010710-pI-Algorithm.pdf
+"""
+
+positive_pKs = {"Nterm": 7.5, "K": 10.0, "R": 12.0, "H": 5.98}
+negative_pKs = {"Cterm": 3.55, "D": 4.05, "E": 4.45, "C": 9.0, "Y": 10.0}
+pKcterminal = {"D": 4.55, "E": 4.75}
+pKnterminal = {
+    "A": 7.59,
+    "M": 7.0,
+    "S": 6.93,
+    "P": 8.36,
+    "T": 6.82,
+    "V": 7.44,
+    "E": 7.7,
+}
+charged_aas = ("K", "R", "H", "D", "E", "C", "Y")
+
+
+class IsoelectricPoint:
+    """A class for calculating the IEP or charge at given pH of a protein.
+
+    Parameters
+    ----------
+    :protein_sequence: A ``Bio.Seq`` or string object containing a protein
+                       sequence.
+    :aa_content: A dictionary with amino acid letters as keys and their
+                 occurrences as integers, e.g. ``{"A": 3, "C": 0, ...}``.
+                 Default: ``None``. If ``None``, the dict will be calculated
+                 from the given sequence.
+
+    Methods
+    -------
+    :charge_at_pH(pH):  Calculates the charge of the protein for a given pH
+    :pi():              Calculates the isoelectric point
+
+
+    Examples
+    --------
+    The methods of this class can either be accessed from the class itself
+    or from a ``ProtParam.ProteinAnalysis`` object (with partially different
+    names):
+
+    >>> from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
+    >>> protein = IP("INGAR")
+    >>> print(f"IEP of peptide {protein.sequence} is {protein.pi():.2f}")
+    IEP of peptide INGAR is 9.75
+    >>> print(f"Its charge at pH 7 is {protein.charge_at_pH(7.0):.2f}")
+    Its charge at pH 7 is 0.76
+
+
+    >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
+    >>> protein = PA("PETER")
+    >>> print(f"IEP of {protein.sequence}: {protein.isoelectric_point():.2f}")
+    IEP of PETER: 4.53
+    >>> print(f"Charge at pH 4.53: {protein.charge_at_pH(4.53):.2f}")
+    Charge at pH 4.53: 0.00
+
+    """
+
+    def __init__(self, protein_sequence, aa_content=None):
+        """Initialize the class."""
+        self.sequence = str(protein_sequence).upper()
+        if not aa_content:
+            from Bio.SeqUtils.ProtParam import ProteinAnalysis as _PA
+
+            aa_content = _PA(self.sequence).count_amino_acids()
+        self.charged_aas_content = self._select_charged(aa_content)
+
+        self.pos_pKs, self.neg_pKs = self._update_pKs_tables()
+
+    # This function creates a dictionary with the contents of each charged aa,
+    # plus Cterm and Nterm.
+    def _select_charged(self, aa_content):
+        charged = {}
+        for aa in charged_aas:
+            charged[aa] = float(aa_content[aa])
+        charged["Nterm"] = 1.0
+        charged["Cterm"] = 1.0
+        return charged
+
+    def _update_pKs_tables(self):
+        """Update pKs tables with seq specific values for N- and C-termini."""
+        pos_pKs = positive_pKs.copy()
+        neg_pKs = negative_pKs.copy()
+        nterm, cterm = self.sequence[0], self.sequence[-1]
+        if nterm in pKnterminal:
+            pos_pKs["Nterm"] = pKnterminal[nterm]
+        if cterm in pKcterminal:
+            neg_pKs["Cterm"] = pKcterminal[cterm]
+        return pos_pKs, neg_pKs
+
+    def charge_at_pH(self, pH):
+        """Calculate the charge of a protein at given pH."""
+        # derivation:
+        #   Henderson Hasselbalch equation: pH = pKa + log([A-]/[HA])
+        #   Rearranging: [HA]/[A-] = 10 ** (pKa - pH)
+        #   partial_charge =
+        #       [A-]/[A]total = [A-]/([A-] + [HA]) = 1 / { ([A-] + [HA])/[A-] } =
+        #       1 / (1 + [HA]/[A-]) = 1 / (1 + 10 ** (pKa - pH)) for acidic residues;
+        #                             1 / (1 + 10 ** (pH - pKa)) for basic residues
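+        # Worked example (illustrative, added note): a lone histidine
+        # (pK 5.98 in the table above) at pH 7.0 carries a partial positive
+        # charge of 1 / (1 + 10 ** (7.0 - 5.98)) ~ 0.087, i.e. roughly one
+        # pH unit above its pK the residue is mostly deprotonated.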
+        positive_charge = 0.0
+        for aa, pK in self.pos_pKs.items():
+            partial_charge = 1.0 / (10 ** (pH - pK) + 1.0)
+            positive_charge += self.charged_aas_content[aa] * partial_charge
+
+        negative_charge = 0.0
+        for aa, pK in self.neg_pKs.items():
+            partial_charge = 1.0 / (10 ** (pK - pH) + 1.0)
+            negative_charge += self.charged_aas_content[aa] * partial_charge
+
+        return positive_charge - negative_charge
+
+    # This is the action function; it tries different pH values until the
+    # charge of the protein is 0 (or close).
+    def pi(self, pH=7.775, min_=4.05, max_=12):
+        r"""Calculate and return the isoelectric point as float.
+
+        This is a recursive function that uses the bisection method.
+        Wiki on bisection: https://en.wikipedia.org/wiki/Bisection_method
+
+        Arguments:
+         - pH: the pH at which the current charge of the protein is computed.
+           This pH lies at the centre of the interval (mean of `min_` and `max_`).
+         - min\_: the minimum of the interval. Initial value defaults to 4.05,
+           which is below the theoretical minimum, when the protein is composed
+           exclusively of aspartate.
+         - max\_: the maximum of the interval. Initial value defaults to 12,
+           which is above the theoretical maximum, when the protein is composed
+           exclusively of arginine.
+        """
+        charge = self.charge_at_pH(pH)
+        if max_ - min_ > 0.0001:
+            if charge > 0.0:
+                min_ = pH
+            else:
+                max_ = pH
+            next_pH = (min_ + max_) / 2
+            return self.pi(next_pH, min_, max_)
+        return pH
diff --git a/code/lib/Bio/SeqUtils/MeltingTemp.py b/code/lib/Bio/SeqUtils/MeltingTemp.py
new file mode 100644
index 0000000..90c47ae
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/MeltingTemp.py
@@ -0,0 +1,1136 @@
+# Copyright 2004-2008 by Sebastian Bassi.
+# Copyright 2013-2018 by Markus Piotrowski.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Calculate the melting temperature of nucleotide sequences.
+
+This module contains three different methods to calculate the melting
+temperature of oligonucleotides:
+
+1. Tm_Wallace: 'Rule of thumb'
+2. Tm_GC: Empirical formulas based on GC content. Salt and mismatch corrections
+   can be included.
+3. Tm_NN: Calculation based on nearest neighbor thermodynamics. Several tables
+   for DNA/DNA, DNA/RNA and RNA/RNA hybridizations are included.
+   Correction for mismatches, dangling ends, salt concentration and other
+   additives are available.
+
+Tm_staluc is the 'old' NN calculation and is kept for compatibility. It is,
+however, recommended to use Tm_NN instead, since Tm_staluc may be deprecated
+in the future. Also, Tm_NN has many more options. Using Tm_staluc and Tm_NN
+with default parameters gives (essentially) the same results.
+
+General parameters for most Tm methods:
+ - seq -- A Biopython sequence object or a string.
+ - check -- Checks if the sequence is valid for the given method (default=
+   True). In general, whitespaces and non-base characters are removed and
+   characters are converted to uppercase. RNA will be backtranscribed.
+ - strict -- Do not allow base characters or neighbor duplex keys (e.g.
+   'AT/NA') that cannot be evaluated, or cannot be evaluated unambiguously,
+   for the respective method (default=True). Note that W (= A or T) and
+   S (= C or G) are not ambiguous for Tm_Wallace and Tm_GC. If 'False',
+   average values (if applicable) will be used.
+
+This module is not able to detect self-complementary sequences and it will
+not use alignment tools to align an oligonucleotide sequence to its target
+sequence. Thus it cannot detect dangling ends and mismatches by itself (don't
+even think about bulges and loops). These parameters have to be handed over
+to the respective method.
+
+Other public methods of this module:
+ - make_table     : To create a table with thermodynamic data.
+ - salt_correction: To adjust Tm to a given salt concentration by different
+   formulas. This method is called from Tm_GC and Tm_NN but may
+   also be accessed 'manually'. It returns a correction term, not
+   a corrected Tm!
+ - chem_correction: To adjust Tm regarding the chemical additives DMSO and
+   formaldehyde. The method returns a corrected Tm. Chemical
+   correction is not an integral part of the Tm methods and must
+   be called additionally.
+
+For example:
+
+    >>> from Bio.SeqUtils import MeltingTemp as mt
+    >>> from Bio.Seq import Seq
+    >>> mystring = 'CGTTCCAAAGATGTGGGCATGAGCTTAC'
+    >>> myseq = Seq(mystring)
+    >>> print('%0.2f' % mt.Tm_Wallace(mystring))
+    84.00
+    >>> print('%0.2f' % mt.Tm_Wallace(myseq))
+    84.00
+    >>> print('%0.2f' % mt.Tm_GC(myseq))
+    58.73
+    >>> print('%0.2f' % mt.Tm_NN(myseq))
+    60.32
+
+Tm_NN with default values gives the same result as the 'old' Tm_staluc.
+However, values differ for RNA, since Tm_staluc had some errors for RNA
+calculation. These errors have been fixed in this version.
+
+New Tm_NN can do slightly more:
+Using different thermodynamic tables, e.g. from Breslauer '86 or Sugimoto '96:
+
+    >>> print('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN1))  # Breslauer '86
+    72.19
+    >>> print('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN2))  # Sugimoto '96
+    65.47
+
+Tables for RNA and RNA/DNA hybrids are included:
+
+    >>> print('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.RNA_NN1))  # Freier '86
+    73.35
+    >>> print('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.R_DNA_NN1))  # Sugimoto '95
+    58.45
+
+Several types of salt correction (for Tm_NN and Tm_GC):
+
+    >>> for i in range(1, 8):
+    ...     print("Type: %d, Tm: %0.2f" % (i, Tm_NN(myseq, saltcorr=i)))
+    ...
+    Type: 1, Tm: 54.27
+    Type: 2, Tm: 54.02
+    Type: 3, Tm: 59.60
+    Type: 4, Tm: 60.64
+    Type: 5, Tm: 60.32
+    Type: 6, Tm: 59.78
+    Type: 7, Tm: 59.78
+
+Correction for other monovalent cations (K+, Tris), Mg2+ and dNTPs according
+to von Ahsen et al. (2001) or Owczarzy et al. (2008) (for Tm_NN and Tm_GC):
+
+    >>> print('%0.2f' % mt.Tm_NN(myseq, Na=50, Tris=10))
+    60.79
+    >>> print('%0.2f' % mt.Tm_NN(myseq, Na=50, Tris=10, Mg=1.5))
+    67.39
+    >>> print('%0.2f' % mt.Tm_NN(myseq, Na=50, Tris=10, Mg=1.5, saltcorr=7))
+    66.81
+    >>> print('%0.2f' % mt.Tm_NN(myseq, Na=50, Tris=10, Mg=1.5, dNTPs=0.6,
+    ...                          saltcorr=7))
+    66.04
+
+Dangling ends and mismatches, e.g.::
+
+    Oligo:     CGTTCCaAAGATGTGGGCATGAGCTTAC       CGTTCCaAAGATGTGGGCATGAGCTTAC
+               ::::::X:::::::::::::::::::::  or   ::::::X:::::::::::::::::::::
+    Template:  GCAAGGcTTCTACACCCGTACTCGAATG      TGCAAGGcTTCTACACCCGTACTCGAATGC
+
+Here:
+
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCAAAGATGTGGGCATGAGCTTAC'))
+    60.32
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCAAAGATGTGGGCATGAGCTTAC',
+    ...                    c_seq='GCAAGGcTTCTACACCCGTACTCGAATG'))
+    55.39
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCAAAGATGTGGGCATGAGCTTAC', shift=1,
+    ...                   c_seq='TGCAAGGcTTCTACACCCGTACTCGAATGC'))
+    55.69
+
+The same for RNA:
+
+    >>> print('%0.2f' % mt.Tm_NN('CGUUCCAAAGAUGUGGGCAUGAGCUUAC',
+    ...                   c_seq='UGCAAGGcUUCUACACCCGUACUCGAAUGC',
+    ...                   shift=1, nn_table=mt.RNA_NN3,
+    ...                   de_table=mt.RNA_DE1))
+    73.00
+
+Note that thermodynamic data are not available for all kinds of mismatches,
+e.g. most double mismatches or terminal mismatches combined with dangling ends:
+
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCAAAGATGTGGGCATGAGCTTAC',
+    ...                   c_seq='TtCAAGGcTTCTACACCCGTACTCGAATGC',
+    ...                   shift=1))
+    Traceback (most recent call last):
+    ValueError: no thermodynamic data for neighbors '.C/TT' available
+
+Make your own tables, or update/extend existing tables. E.g., add values for
+locked nucleotides. Here, 'locked A' (and its complement) should be represented
+by '1':
+
+    >>> mytable = mt.make_table(oldtable=mt.DNA_NN3,
+    ...                         values={'A1/T1':(-6.608, -17.235),
+    ...                         '1A/1T':(-6.893, -15.923)})
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCAAAGATGTGGGCATGAGCTTAC'))
+    60.32
+    >>> print('%0.2f' % mt.Tm_NN('CGTTCCA1AGATGTGGGCATGAGCTTAC',
+    ...                           nn_table=mytable, check=False))
+    ... # 'check' must be False, otherwise '1' would be discarded
+    62.53
+
+"""
+
+
+import math
+import warnings
+
+from Bio import SeqUtils, Seq
+from Bio import BiopythonWarning
+from Bio import BiopythonDeprecationWarning
+
+
+# Thermodynamic lookup tables (dictionaries):
+# Enthalpy (dH) and entropy (dS) values for nearest neighbors and initiation
+# process. Calculation of duplex initiation is quite different in several
+# papers; to allow for a general calculation, all different initiation
+# parameters are included in all tables and non-applicable parameters are set
+# to zero.
+# The key is either an initiation type (e.g., 'init_A/T') or a nearest neighbor
+# duplex sequence (e.g., GT/CA, to read 5'GT3'-3'CA5'). The values are tuples
+# of dH (kcal/mol), dS (cal/mol K).
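+# For example, DNA_NN3["AA/TT"] == (-7.9, -22.2) below means dH = -7.9
+# kcal/mol and dS = -22.2 cal/(mol K) for the duplex step 5'AA3'/3'TT5'.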
+
+# Turn black code style off
+# fmt: off
+
+# DNA/DNA
+# Breslauer et al. (1986), Proc Natl Acad Sci USA 83: 3746-3750
+DNA_NN1 = {
+    "init": (0, 0), "init_A/T": (0, 0), "init_G/C": (0, 0),
+    "init_oneG/C": (0, -16.8), "init_allA/T": (0, -20.1), "init_5T/A": (0, 0),
+    "sym": (0, -1.3),
+    "AA/TT": (-9.1, -24.0), "AT/TA": (-8.6, -23.9), "TA/AT": (-6.0, -16.9),
+    "CA/GT": (-5.8, -12.9), "GT/CA": (-6.5, -17.3), "CT/GA": (-7.8, -20.8),
+    "GA/CT": (-5.6, -13.5), "CG/GC": (-11.9, -27.8), "GC/CG": (-11.1, -26.7),
+    "GG/CC": (-11.0, -26.6)}
+
+# Sugimoto et al. (1996), Nuc Acids Res 24 : 4501-4505
+DNA_NN2 = {
+    "init": (0.6, -9.0), "init_A/T": (0, 0), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-8.0, -21.9), "AT/TA": (-5.6, -15.2), "TA/AT": (-6.6, -18.4),
+    "CA/GT": (-8.2, -21.0), "GT/CA": (-9.4, -25.5), "CT/GA": (-6.6, -16.4),
+    "GA/CT": (-8.8, -23.5), "CG/GC": (-11.8, -29.0), "GC/CG": (-10.5, -26.4),
+    "GG/CC": (-10.9, -28.4)}
+
+# Allawi and SantaLucia (1997), Biochemistry 36: 10581-10594
+DNA_NN3 = {
+    "init": (0, 0), "init_A/T": (2.3, 4.1), "init_G/C": (0.1, -2.8),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-7.9, -22.2), "AT/TA": (-7.2, -20.4), "TA/AT": (-7.2, -21.3),
+    "CA/GT": (-8.5, -22.7), "GT/CA": (-8.4, -22.4), "CT/GA": (-7.8, -21.0),
+    "GA/CT": (-8.2, -22.2), "CG/GC": (-10.6, -27.2), "GC/CG": (-9.8, -24.4),
+    "GG/CC": (-8.0, -19.9)}
+
+# SantaLucia & Hicks (2004), Annu. Rev. Biophys. Biomol. Struct 33: 415-440
+DNA_NN4 = {
+    "init": (0.2, -5.7), "init_A/T": (2.2, 6.9), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-7.6, -21.3), "AT/TA": (-7.2, -20.4), "TA/AT": (-7.2, -20.4),
+    "CA/GT": (-8.5, -22.7), "GT/CA": (-8.4, -22.4), "CT/GA": (-7.8, -21.0),
+    "GA/CT": (-8.2, -22.2), "CG/GC": (-10.6, -27.2), "GC/CG": (-9.8, -24.4),
+    "GG/CC": (-8.0, -19.0)}
+
+# RNA/RNA
+# Freier et al. (1986), Proc Natl Acad Sci USA 83: 9373-9377
+RNA_NN1 = {
+    "init": (0, -10.8), "init_A/T": (0, 0), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-6.6, -18.4), "AT/TA": (-5.7, -15.5), "TA/AT": (-8.1, -22.6),
+    "CA/GT": (-10.5, -27.8), "GT/CA": (-10.2, -26.2), "CT/GA": (-7.6, -19.2),
+    "GA/CT": (-13.3, -35.5), "CG/GC": (-8.0, -19.4), "GC/CG": (-14.2, -34.9),
+    "GG/CC": (-12.2, -29.7)}
+
+# Xia et al (1998), Biochemistry 37: 14719-14735
+RNA_NN2 = {
+    "init": (3.61, -1.5), "init_A/T": (3.72, 10.5), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-6.82, -19.0), "AT/TA": (-9.38, -26.7), "TA/AT": (-7.69, -20.5),
+    "CA/GT": (-10.44, -26.9), "GT/CA": (-11.40, -29.5),
+    "CT/GA": (-10.48, -27.1), "GA/CT": (-12.44, -32.5),
+    "CG/GC": (-10.64, -26.7), "GC/CG": (-14.88, -36.9),
+    "GG/CC": (-13.39, -32.7)}
+
+# Chen et al. (2012), Biochemistry 51: 3508-3522
+RNA_NN3 = {
+    "init": (6.40, 6.99), "init_A/T": (3.85, 11.04), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, -1.4),
+    "AA/TT": (-7.09, -19.8), "AT/TA": (-9.11, -25.8), "TA/AT": (-8.50, -22.9),
+    "CA/GT": (-11.03, -28.8), "GT/CA": (-11.98, -31.3),
+    "CT/GA": (-10.90, -28.5), "GA/CT": (-13.21, -34.9),
+    "CG/GC": (-10.88, -27.4), "GC/CG": (-16.04, -40.6),
+    "GG/CC": (-14.18, -35.0), "GT/TG": (-13.83, -46.9),
+    "GG/TT": (-17.82, -56.7), "AG/TT": (-3.96, -11.6),
+    "TG/AT": (-0.96, -1.8), "TT/AG": (-10.38, -31.8), "TG/GT": (-12.64, -38.9),
+    "AT/TG": (-7.39, -21.0), "CG/GT": (-5.56, -13.9), "CT/GG": (-9.44, -24.7),
+    "GG/CT": (-7.03, -16.8), "GT/CG": (-11.09, -28.8)}
+
+# RNA/DNA
+# Sugimoto et al. (1995), Biochemistry 34: 11211-11216
+R_DNA_NN1 = {
+    "init": (1.9, -3.9), "init_A/T": (0, 0), "init_G/C": (0, 0),
+    "init_oneG/C": (0, 0), "init_allA/T": (0, 0), "init_5T/A": (0, 0),
+    "sym": (0, 0),
+    "TT/AA": (-11.5, -36.4), "GT/CA": (-7.8, -21.6), "CT/GA": (-7.0, -19.7),
+    "AT/TA": (-8.3, -23.9), "TG/AC": (-10.4, -28.4), "GG/CC": (-12.8, -31.9),
+    "CG/GC": (-16.3, -47.1), "AG/TC": (-9.1, -23.5), "TC/AG": (-8.6, -22.9),
+    "GC/CG": (-8.0, -17.1), "CC/GG": (-9.3, -23.2), "AC/TG": (-5.9, -12.3),
+    "TA/AT": (-7.8, -23.2), "GA/CT": (-5.5, -13.5), "CA/GT": (-9.0, -26.1),
+    "AA/TT": (-7.8, -21.9)}
+
+# Internal mismatch and inosine table (DNA)
+# Allawi & SantaLucia (1997), Biochemistry 36: 10581-10594
+# Allawi & SantaLucia (1998), Biochemistry 37: 9435-9444
+# Allawi & SantaLucia (1998), Biochemistry 37: 2170-2179
+# Allawi & SantaLucia (1998), Nucl Acids Res 26: 2694-2701
+# Peyret et al. (1999), Biochemistry 38: 3468-3477
+# Watkins & SantaLucia (2005), Nucl Acids Res 33: 6258-6267
+DNA_IMM1 = {
+    "AG/TT": (1.0, 0.9), "AT/TG": (-2.5, -8.3), "CG/GT": (-4.1, -11.7),
+    "CT/GG": (-2.8, -8.0), "GG/CT": (3.3, 10.4), "GG/TT": (5.8, 16.3),
+    "GT/CG": (-4.4, -12.3), "GT/TG": (4.1, 9.5), "TG/AT": (-0.1, -1.7),
+    "TG/GT": (-1.4, -6.2), "TT/AG": (-1.3, -5.3), "AA/TG": (-0.6, -2.3),
+    "AG/TA": (-0.7, -2.3), "CA/GG": (-0.7, -2.3), "CG/GA": (-4.0, -13.2),
+    "GA/CG": (-0.6, -1.0), "GG/CA": (0.5, 3.2), "TA/AG": (0.7, 0.7),
+    "TG/AA": (3.0, 7.4),
+    "AC/TT": (0.7, 0.2), "AT/TC": (-1.2, -6.2), "CC/GT": (-0.8, -4.5),
+    "CT/GC": (-1.5, -6.1), "GC/CT": (2.3, 5.4), "GT/CC": (5.2, 13.5),
+    "TC/AT": (1.2, 0.7), "TT/AC": (1.0, 0.7),
+    "AA/TC": (2.3, 4.6), "AC/TA": (5.3, 14.6), "CA/GC": (1.9, 3.7),
+    "CC/GA": (0.6, -0.6), "GA/CC": (5.2, 14.2), "GC/CA": (-0.7, -3.8),
+    "TA/AC": (3.4, 8.0), "TC/AA": (7.6, 20.2),
+    "AA/TA": (1.2, 1.7), "CA/GA": (-0.9, -4.2), "GA/CA": (-2.9, -9.8),
+    "TA/AA": (4.7, 12.9), "AC/TC": (0.0, -4.4), "CC/GC": (-1.5, -7.2),
+    "GC/CC": (3.6, 8.9), "TC/AC": (6.1, 16.4), "AG/TG": (-3.1, -9.5),
+    "CG/GG": (-4.9, -15.3), "GG/CG": (-6.0, -15.8), "TG/AG": (1.6, 3.6),
+    "AT/TT": (-2.7, -10.8), "CT/GT": (-5.0, -15.8), "GT/CT": (-2.2, -8.4),
+    "TT/AT": (0.2, -1.5),
+    "AI/TC": (-8.9, -25.5), "TI/AC": (-5.9, -17.4), "AC/TI": (-8.8, -25.4),
+    "TC/AI": (-4.9, -13.9), "CI/GC": (-5.4, -13.7), "GI/CC": (-6.8, -19.1),
+    "CC/GI": (-8.3, -23.8), "GC/CI": (-5.0, -12.6),
+    "AI/TA": (-8.3, -25.0), "TI/AA": (-3.4, -11.2), "AA/TI": (-0.7, -2.6),
+    "TA/AI": (-1.3, -4.6), "CI/GA": (2.6, 8.9), "GI/CA": (-7.8, -21.1),
+    "CA/GI": (-7.0, -20.0), "GA/CI": (-7.6, -20.2),
+    "AI/TT": (0.49, -0.7), "TI/AT": (-6.5, -22.0), "AT/TI": (-5.6, -18.7),
+    "TT/AI": (-0.8, -4.3), "CI/GT": (-1.0, -2.4), "GI/CT": (-3.5, -10.6),
+    "CT/GI": (0.1, -1.0), "GT/CI": (-4.3, -12.1),
+    "AI/TG": (-4.9, -15.8), "TI/AG": (-1.9, -8.5), "AG/TI": (0.1, -1.8),
+    "TG/AI": (1.0, 1.0), "CI/GG": (7.1, 21.3), "GI/CG": (-1.1, -3.2),
+    "CG/GI": (5.8, 16.9), "GG/CI": (-7.6, -22.0),
+    "AI/TI": (-3.3, -11.9), "TI/AI": (0.1, -2.3), "CI/GI": (1.3, 3.0),
+    "GI/CI": (-0.5, -1.3)}
+
+# Terminal mismatch table (DNA)
+# SantaLucia & Peyret (2001) Patent Application WO 01/94611
+DNA_TMM1 = {
+    "AA/TA": (-3.1, -7.8), "TA/AA": (-2.5, -6.3), "CA/GA": (-4.3, -10.7),
+    "GA/CA": (-8.0, -22.5),
+    "AC/TC": (-0.1, 0.5), "TC/AC": (-0.7, -1.3), "CC/GC": (-2.1, -5.1),
+    "GC/CC": (-3.9, -10.6),
+    "AG/TG": (-1.1, -2.1), "TG/AG": (-1.1, -2.7), "CG/GG": (-3.8, -9.5),
+    "GG/CG": (-0.7, -19.2),
+    "AT/TT": (-2.4, -6.5), "TT/AT": (-3.2, -8.9), "CT/GT": (-6.1, -16.9),
+    "GT/CT": (-7.4, -21.2),
+    "AA/TC": (-1.6, -4.0), "AC/TA": (-1.8, -3.8), "CA/GC": (-2.6, -5.9),
+    "CC/GA": (-2.7, -6.0), "GA/CC": (-5.0, -13.8), "GC/CA": (-3.2, -7.1),
+    "TA/AC": (-2.3, -5.9), "TC/AA": (-2.7, -7.0),
+    "AC/TT": (-0.9, -1.7), "AT/TC": (-2.3, -6.3), "CC/GT": (-3.2, -8.0),
+    "CT/GC": (-3.9, -10.6), "GC/CT": (-4.9, -13.5), "GT/CC": (-3.0, -7.8),
+    "TC/AT": (-2.5, -6.3), "TT/AC": (-0.7, -1.2),
+    "AA/TG": (-1.9, -4.4), "AG/TA": (-2.5, -5.9), "CA/GG": (-3.9, -9.6),
+    "CG/GA": (-6.0, -15.5), "GA/CG": (-4.3, -11.1), "GG/CA": (-4.6, -11.4),
+    "TA/AG": (-2.0, -4.7), "TG/AA": (-2.4, -5.8),
+    "AG/TT": (-3.2, -8.7), "AT/TG": (-3.5, -9.4), "CG/GT": (-3.8, -9.0),
+    "CT/GG": (-6.6, -18.7), "GG/CT": (-5.7, -15.9), "GT/CG": (-5.9, -16.1),
+    "TG/AT": (-3.9, -10.5), "TT/AG": (-3.6, -9.8)}
+
+# Dangling ends table (DNA)
+# Bommarito et al. (2000), Nucl Acids Res 28: 1929-1934
+DNA_DE1 = {
+    "AA/.T": (0.2, 2.3), "AC/.G": (-6.3, -17.1), "AG/.C": (-3.7, -10.0),
+    "AT/.A": (-2.9, -7.6), "CA/.T": (0.6, 3.3), "CC/.G": (-4.4, -12.6),
+    "CG/.C": (-4.0, -11.9), "CT/.A": (-4.1, -13.0), "GA/.T": (-1.1, -1.6),
+    "GC/.G": (-5.1, -14.0), "GG/.C": (-3.9, -10.9), "GT/.A": (-4.2, -15.0),
+    "TA/.T": (-6.9, -20.0), "TC/.G": (-4.0, -10.9), "TG/.C": (-4.9, -13.8),
+    "TT/.A": (-0.2, -0.5),
+    ".A/AT": (-0.7, -0.8), ".C/AG": (-2.1, -3.9), ".G/AC": (-5.9, -16.5),
+    ".T/AA": (-0.5, -1.1), ".A/CT": (4.4, 14.9), ".C/CG": (-0.2, -0.1),
+    ".G/CC": (-2.6, -7.4), ".T/CA": (4.7, 14.2), ".A/GT": (-1.6, -3.6),
+    ".C/GG": (-3.9, -11.2), ".G/GC": (-3.2, -10.4), ".T/GA": (-4.1, -13.1),
+    ".A/TT": (2.9, 10.4), ".C/TG": (-4.4, -13.1), ".G/TC": (-5.2, -15.0),
+    ".T/TA": (-3.8, -12.6)}
+
+# Dangling ends table (RNA)
+# Turner & Mathews (2010), Nucl Acids Res 38: D280-D282
+RNA_DE1 = {
+    ".T/AA": (-4.9, -13.2), ".T/CA": (-0.9, -1.3), ".T/GA": (-5.5, -15.1),
+    ".T/TA": (-2.3, -5.5),
+    ".G/AC": (-9.0, -23.5), ".G/CC": (-4.1, -10.6), ".G/GC": (-8.6, -22.2),
+    ".G/TC": (-7.5, -20.31),
+    ".C/AG": (-7.4, -20.3), ".C/CG": (-2.8, -7.7), ".C/GG": (-6.4, -16.4),
+    ".C/TG": (-3.6, -9.7),
+    ".T/AG": (-4.9, -13.2), ".T/CG": (-0.9, -1.3), ".T/GG": (-5.5, -15.1),
+    ".T/TG": (-2.3, -5.5),
+    ".A/AT": (-5.7, -16.1), ".A/CT": (-0.7, -1.9), ".A/GT": (-5.8, -16.4),
+    ".A/TT": (-2.2, -6.8),
+    ".G/AT": (-5.7, -16.1), ".G/CT": (-0.7, -1.9), ".G/GT": (-5.8, -16.4),
+    ".G/TT": (-2.2, -6.8),
+    "AT/.A": (-0.5, -0.6), "CT/.A": (6.9, 22.6), "GT/.A": (0.6, 2.6),
+    "TT/.A": (0.6, 2.6),
+    "AG/.C": (-1.6, -4.5), "CG/.C": (0.7, 3.2), "GG/.C": (-4.6, -14.8),
+    "TG/.C": (-0.4, -1.3),
+    "AC/.G": (-2.4, -6.1), "CC/.G": (3.3, 11.6), "GC/.G": (0.8, 3.2),
+    "TC/.G": (-1.4, -4.2),
+    "AT/.G": (-0.5, -0.6), "CT/.G": (6.9, 22.6), "GT/.G": (0.6, 2.6),
+    "TT/.G": (0.6, 2.6),
+    "AA/.T": (1.6, 6.1), "CA/.T": (2.2, 8.1), "GA/.T": (0.7, 3.5),
+    "TA/.T": (3.1, 10.6),
+    "AG/.T": (1.6, 6.1), "CG/.T": (2.2, 8.1), "GG/.T": (0.7, 3.5),
+    "TG/.T": (3.1, 10.6)}
+
+# Turn black code style on
+# fmt: on
+
+
+def make_table(oldtable=None, values=None):
+    """Return a table with thermodynamic parameters (as dictionary).
+
+    Arguments:
+     - oldtable: An existing dictionary with thermodynamic parameters.
+     - values: A dictionary with new or updated values.
+
+    E.g., to replace the initiation parameters in the Sugimoto '96 dataset with
+    the initiation parameters from Allawi & SantaLucia '97:
+
+    >>> from Bio.SeqUtils.MeltingTemp import make_table, DNA_NN2
+    >>> table = DNA_NN2                               # Sugimoto '96
+    >>> table['init_A/T']
+    (0, 0)
+    >>> newtable = make_table(oldtable=DNA_NN2, values={'init': (0, 0),
+    ...                       'init_A/T': (2.3, 4.1),
+    ...                       'init_G/C': (0.1, -2.8)})
+    >>> print("%0.1f, %0.1f" % newtable['init_A/T'])
+    2.3, 4.1
+
+    """
+    if oldtable is None:
+        table = {
+            "init": (0, 0),
+            "init_A/T": (0, 0),
+            "init_G/C": (0, 0),
+            "init_oneG/C": (0, 0),
+            "init_allA/T": (0, 0),
+            "init_5T/A": (0, 0),
+            "sym": (0, 0),
+            "AA/TT": (0, 0),
+            "AT/TA": (0, 0),
+            "TA/AT": (0, 0),
+            "CA/GT": (0, 0),
+            "GT/CA": (0, 0),
+            "CT/GA": (0, 0),
+            "GA/CT": (0, 0),
+            "CG/GC": (0, 0),
+            "GC/CG": (0, 0),
+            "GG/CC": (0, 0),
+        }
+    else:
+        table = oldtable.copy()
+    if values:
+        table.update(values)
+    return table
+
+
+def _check(seq, method):
+    """Return a sequence which fullfils the requirements of the given method (PRIVATE).
+
+    All Tm methods in this package require the sequence in uppercase format.
+    Most methods make use of the length of the sequence (directly or
+    indirectly), which can only be expressed as len(seq) if the sequence does
+    not contain whitespaces and other non-base characters. RNA sequences are
+    backtranscribed to DNA. This method is PRIVATE.
+
+    Arguments:
+     - seq: The sequence as given by the user (passed as string).
+     - method: Tm_Wallace, Tm_GC or Tm_NN.
+
+    >>> from Bio.SeqUtils import MeltingTemp as mt
+    >>> mt._check('10 ACGTTGCAAG tccatggtac', 'Tm_NN')
+    'ACGTTGCAAGTCCATGGTAC'
+
+    """
+    seq = "".join(seq.split()).upper()
+    seq = str(Seq.Seq(seq).back_transcribe())
+    if method == "Tm_Wallace":
+        return seq
+    if method == "Tm_GC":
+        baseset = (
+            "A",
+            "B",
+            "C",
+            "D",
+            "G",
+            "H",
+            "I",
+            "K",
+            "M",
+            "N",
+            "R",
+            "S",
+            "T",
+            "V",
+            "W",
+            "X",
+            "Y",
+        )
+    if method == "Tm_NN":
+        baseset = ("A", "C", "G", "T", "I")
+    seq = "".join([base for base in seq if base in baseset])
+    return seq
+
+
+def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
+    """Calculate a term to correct Tm for salt ions.
+
+    Depending on the Tm calculation, the term will correct Tm or entropy. To
+    calculate corrected Tm values, different operations need to be applied:
+
+     - methods 1-4: Tm(new) = Tm(old) + corr
+     - method 5: deltaS(new) = deltaS(old) + corr
+     - methods 6+7: Tm(new) = 1/(1/Tm(old) + corr)
+
+    Arguments:
+     - Na, K, Tris, Mg, dNTPs: Millimolar concentration of the respective ion.
+       To have a simple 'salt correction', just pass Na. If any of K, Tris, Mg
+       and dNTPs is non-zero, a 'sodium-equivalent' concentration is calculated
+       according to von Ahsen et al. (2001, Clin Chem 47: 1956-1961):
+       [Na_eq] = [Na+] + [K+] + [Tris]/2 + 120*([Mg2+] - [dNTPs])^0.5
+       If [dNTPs] >= [Mg2+]: [Na_eq] = [Na+] + [K+] + [Tris]/2
+     - method: Which method to be applied. Methods 1-4 correct Tm, method 5
+       corrects deltaS, methods 6 and 7 correct 1/Tm. The methods are:
+
+       1. 16.6 x log[Na+]
+          (Schildkraut & Lifson (1965), Biopolymers 3: 195-208)
+       2. 16.6 x log([Na+]/(1.0 + 0.7*[Na+]))
+          (Wetmur (1991), Crit Rev Biochem Mol Biol 126: 227-259)
+       3. 12.5 x log[Na+]
+          (SantaLucia et al. (1996), Biochemistry 35: 3555-3562)
+       4. 11.7 x log[Na+]
+          (SantaLucia (1998), Proc Natl Acad Sci USA 95: 1460-1465)
+       5. Correction for deltaS: 0.368 x (N-1) x ln[Na+]
+          (SantaLucia (1998), Proc Natl Acad Sci USA 95: 1460-1465)
+       6. (4.29 x (%GC) - 3.95) x 1e-5 x ln[Na+] + 9.40e-6 x ln[Na+]^2
+          (Owczarzy et al. (2004), Biochemistry 43: 3537-3554)
+       7. Complex formula with decision tree and 7 empirical constants.
+          Mg2+ is corrected for dNTPs binding (if present)
+          (Owczarzy et al. (2008), Biochemistry 47: 5336-5353)
+
+    Examples
+    --------
+    >>> from Bio.SeqUtils import MeltingTemp as mt
+    >>> print('%0.2f' % mt.salt_correction(Na=50, method=1))
+    -21.60
+    >>> print('%0.2f' % mt.salt_correction(Na=50, method=2))
+    -21.85
+    >>> print('%0.2f' % mt.salt_correction(Na=100, Tris=20, method=2))
+    -16.45
+    >>> print('%0.2f' % mt.salt_correction(Na=100, Tris=20, Mg=1.5, method=2))
+    -10.99
+
+    """
+    if method in (5, 6, 7) and not seq:
+        raise ValueError(
+            "sequence is missing (is needed to calculate GC content or sequence length)."
+        )
+    if seq:
+        seq = str(seq)
+    corr = 0
+    if not method:
+        return corr
+    Mon = Na + K + Tris / 2.0  # Note: all these values are millimolar
+    mg = Mg * 1e-3  # Lowercase ions (mg, mon, dntps) are molar
+    # Na equivalent according to von Ahsen et al. (2001):
+    if sum((K, Mg, Tris, dNTPs)) > 0 and method != 7 and dNTPs < Mg:
+        # dNTPs bind Mg2+ strongly. If [dNTPs] is greater than or equal to
+        # [Mg2+], free Mg2+ is considered not to be relevant.
+        Mon += 120 * math.sqrt(Mg - dNTPs)
+    mon = Mon * 1e-3
+    # Note: math.log = ln(), math.log10 = log()
+    if method in range(1, 7) and not mon:
+        raise ValueError(
+            "Total ion concentration of zero is not allowed in this method."
+        )
+    if method == 1:
+        corr = 16.6 * math.log10(mon)
+    if method == 2:
+        corr = 16.6 * math.log10((mon) / (1.0 + 0.7 * (mon)))
+    if method == 3:
+        corr = 12.5 * math.log10(mon)
+    if method == 4:
+        corr = 11.7 * math.log10(mon)
+    if method == 5:
+        corr = 0.368 * (len(seq) - 1) * math.log(mon)
+    if method == 6:
+        corr = (
+            (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * 1e-5 * math.log(mon)
+        ) + 9.40e-6 * math.log(mon) ** 2
+    # Turn black code style off
+    # fmt: off
+    if method == 7:
+        a, b, c, d = 3.92, -0.911, 6.26, 1.42
+        e, f, g = -48.2, 52.5, 8.31
+        if dNTPs > 0:
+            dntps = dNTPs * 1e-3
+            ka = 3e4  # Dissociation constant for Mg:dNTP
+            # Free Mg2+ calculation:
+            mg = (-(ka * dntps - ka * mg + 1.0)
+                  + math.sqrt((ka * dntps - ka * mg + 1.0) ** 2
+                              + 4.0 * ka * mg)) / (2.0 * ka)
+        if Mon > 0:
+            R = math.sqrt(mg) / mon
+            if R < 0.22:
+                corr = (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * \
+                    1e-5 * math.log(mon) + 9.40e-6 * math.log(mon) ** 2
+                return corr
+            elif R < 6.0:
+                a = 3.92 * (0.843 - 0.352 * math.sqrt(mon) * math.log(mon))
+                d = 1.42 * (1.279 - 4.03e-3 * math.log(mon)
+                            - 8.03e-3 * math.log(mon) ** 2)
+                g = 8.31 * (0.486 - 0.258 * math.log(mon)
+                            + 5.25e-3 * math.log(mon) ** 3)
+        corr = (a + b * math.log(mg) + (SeqUtils.GC(seq) / 100)
+                * (c + d * math.log(mg)) + (1 / (2.0 * (len(seq) - 1)))
+                * (e + f * math.log(mg) + g * math.log(mg) ** 2)) * 1e-5
+    # Turn black code style on
+    # fmt: on
+    if method > 7:
+        raise ValueError("Allowed values for parameter 'method' are 1-7.")
+    return corr
+
+
+def chem_correction(
+    melting_temp, DMSO=0, fmd=0, DMSOfactor=0.75, fmdfactor=0.65, fmdmethod=1, GC=None
+):
+    """Correct a given Tm for DMSO and formamide.
+
+    Please note that these corrections are +/- rough approximations.
+
+    Arguments:
+     - melting_temp: Melting temperature.
+     - DMSO: Percent DMSO.
+     - fmd: Formamide concentration in %(fmdmethod=1) or molar (fmdmethod=2).
+     - DMSOfactor: How much Tm decreases per percent DMSO. Default=0.75
+       (von Ahsen et al. 2001). Other published values are 0.5, 0.6 and 0.675.
+     - fmdfactor: How much Tm decreases per percent formamide.
+       Default=0.65. Several papers report factors between 0.6 and 0.72.
+     - fmdmethod:
+
+         1. Tm = Tm - factor(%formamide) (Default)
+         2. Tm = Tm + (0.453(f(GC)) - 2.88) x [formamide]
+
+       Here f(GC) is fraction of GC.
+       Note (again) that in fmdmethod=1 formamide concentration is given in %,
+       while in fmdmethod=2 it is given in molar.
+     - GC: GC content in percent.
+
+    Examples:
+        >>> from Bio.SeqUtils import MeltingTemp as mt
+        >>> mt.chem_correction(70)
+        70
+        >>> print('%0.2f' % mt.chem_correction(70, DMSO=3))
+        67.75
+        >>> print('%0.2f' % mt.chem_correction(70, fmd=5))
+        66.75
+        >>> print('%0.2f' % mt.chem_correction(70, fmdmethod=2, fmd=1.25,
+        ...                                    GC=50))
+        66.68
+
+    """
+    if DMSO:
+        melting_temp -= DMSOfactor * DMSO
+    if fmd:
+        # McConaughy et al. (1969), Biochemistry 8: 3289-3295
+        if fmdmethod == 1:
+            # Note: Here fmd is given in percent
+            melting_temp -= fmdfactor * fmd
+        # Blake & Delcourt (1996), Nucl Acids Res 11: 2095-2103
+        if fmdmethod == 2:
+            if GC is None or GC < 0:
+                raise ValueError("'GC' is missing or negative")
+            # Note: Here fmd is given in molar
+            melting_temp += (0.453 * (GC / 100.0) - 2.88) * fmd
+        if fmdmethod not in (1, 2):
+            raise ValueError("'fmdmethod' must be 1 or 2")
+    return melting_temp
+
+
+def Tm_Wallace(seq, check=True, strict=True):
+    """Calculate and return the Tm using the 'Wallace rule'.
+
+    Tm = 4 degC * (G + C) + 2 degC * (A+T)
+
+    The Wallace rule (Thein & Wallace 1986, in Human genetic diseases: a
+    practical approach, 33-50) is often used as a rule of thumb for
+    approximate Tm calculations for primers of 14 to 20 nt length.
+
+    Non-DNA characters (e.g., E, F, J, !, 1, etc.) are ignored by this method.
+
+    Examples:
+        >>> from Bio.SeqUtils import MeltingTemp as mt
+        >>> mt.Tm_Wallace('ACGTTGCAATGCCGTA')
+        48.0
+        >>> mt.Tm_Wallace('ACGT TGCA ATGC CGTA')
+        48.0
+        >>> mt.Tm_Wallace('1ACGT2TGCA3ATGC4CGTA')
+        48.0
+
+    """
+    seq = str(seq)
+    if check:
+        seq = _check(seq, "Tm_Wallace")
+
+    melting_temp = 2 * (sum(map(seq.count, ("A", "T", "W")))) + 4 * (
+        sum(map(seq.count, ("C", "G", "S")))
+    )
+
+    # Intermediate values for ambiguous positions:
+    tmp = (
+        3 * (sum(map(seq.count, ("K", "M", "N", "R", "Y"))))
+        + 10 / 3.0 * (sum(map(seq.count, ("B", "V"))))
+        + 8 / 3.0 * (sum(map(seq.count, ("D", "H"))))
+    )
+    if strict and tmp:
+        raise ValueError(
+            "ambiguous bases B, D, H, K, M, N, R, V, Y not allowed when strict=True"
+        )
+    else:
+        melting_temp += tmp
+    return melting_temp
+
+
+def Tm_GC(
+    seq,
+    check=True,
+    strict=True,
+    valueset=7,
+    userset=None,
+    Na=50,
+    K=0,
+    Tris=0,
+    Mg=0,
+    dNTPs=0,
+    saltcorr=0,
+    mismatch=True,
+):
+    """Return the Tm using empirical formulas based on GC content.
+
+    General format: Tm = A + B(%GC) - C/N + salt correction - D(%mismatch)
+
+    A, B, C, D: empirical constants, N: primer length
+    D (amount of decrease in Tm per % mismatch) is often 1, but sometimes other
+    values have been used (0.6-1.5). Use 'X' to indicate the mismatch position
+    in the sequence. Note that this mismatch correction is a rough estimate.
+
+    >>> from Bio.SeqUtils import MeltingTemp as mt
+    >>> print("%0.2f" % mt.Tm_GC('CTGCTGATXGCACGAGGTTATGG', valueset=2))
+    69.20
+
+    Arguments:
+     - valueset: A few often cited variants are included:
+
+        1. Tm = 69.3 + 0.41(%GC) - 650/N
+           (Marmur & Doty 1962, J Mol Biol 5: 109-118; Chester & Marshak 1993),
+           Anal Biochem 209: 284-290)
+        2. Tm = 81.5 + 0.41(%GC) - 675/N - %mismatch
+           'QuikChange' formula. Recommended (by the manufacturer) for the
+           design of primers for QuikChange mutagenesis.
+        3. Tm = 81.5 + 0.41(%GC) - 675/N + 16.6 x log[Na+]
+           (Marmur & Doty 1962, J Mol Biol 5: 109-118; Schildkraut & Lifson
+           1965, Biopolymers 3: 195-208)
+        4. Tm = 81.5 + 0.41(%GC) - 500/N + 16.6 x log([Na+]/(1.0 + 0.7 x
+           [Na+])) - %mismatch
+           (Wetmur 1991, Crit Rev Biochem Mol Biol 126: 227-259). This is the
+           standard formula in approximative mode of MELTING 4.3.
+        5. Tm = 78 + 0.7(%GC) - 500/N + 16.6 x log([Na+]/(1.0 + 0.7 x [Na+]))
+           - %mismatch
+           (Wetmur 1991, Crit Rev Biochem Mol Biol 126: 227-259). For RNA.
+        6. Tm = 67 + 0.8(%GC) - 500/N + 16.6 x log([Na+]/(1.0 + 0.7 x [Na+]))
+           - %mismatch
+           (Wetmur 1991, Crit Rev Biochem Mol Biol 126: 227-259). For RNA/DNA
+           hybrids.
+        7. Tm = 81.5 + 0.41(%GC) - 600/N + 16.6 x log[Na+]
+           Used by Primer3Plus to calculate the product Tm. Default set.
+        8. Tm = 77.1 + 0.41(%GC) - 528/N + 11.7 x log[Na+]
+           (von Ahsen et al. 2001, Clin Chem 47: 1956-1961). Recommended 'as a
+           tradeoff between accuracy and ease of use'.
+
+     - userset: Tuple of four values for A, B, C, and D. Usersets override
+       valuesets.
+     - Na, K, Tris, Mg, dNTPs: Concentration of the respective ions [mM]. If
+       any of K, Tris, Mg and dNTPs is non-zero, a 'sodium-equivalent'
+       concentration is calculated and used for salt correction (von Ahsen et
+       al., 2001).
+     - saltcorr: Type of salt correction (see method salt_correction).
+       Default=0. 0 or None means no salt correction.
+     - mismatch: If 'True' (default) every 'X' in the sequence is counted as
+       mismatch.
+
+    """
+    if saltcorr == 5:
+        raise ValueError("salt-correction method 5 not applicable to Tm_GC")
+    seq = str(seq)
+    if check:
+        seq = _check(seq, "Tm_GC")
+    percent_gc = SeqUtils.GC(seq)
+    # Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability:
+    tmp = (
+        sum(map(seq.count, ("K", "M", "N", "R", "Y"))) * 50.0 / len(seq)
+        + sum(map(seq.count, ("B", "V"))) * 66.67 / len(seq)
+        + sum(map(seq.count, ("D", "H"))) * 33.33 / len(seq)
+    )
+    if strict and tmp:
+        raise ValueError(
+            "ambiguous bases B, D, H, K, M, N, R, V, Y not allowed when 'strict=True'"
+        )
+    else:
+        percent_gc += tmp
+    if userset:
+        A, B, C, D = userset
+    else:
+        if valueset == 1:
+            A, B, C, D = (69.3, 0.41, 650, 1)
+            saltcorr = 0
+        if valueset == 2:
+            A, B, C, D = (81.5, 0.41, 675, 1)
+            saltcorr = 0
+        if valueset == 3:
+            A, B, C, D = (81.5, 0.41, 675, 1)
+            saltcorr = 2
+        if valueset == 4:
+            A, B, C, D = (81.5, 0.41, 500, 1)
+            saltcorr = 3
+        if valueset == 5:
+            A, B, C, D = (78.0, 0.7, 500, 1)
+            saltcorr = 3
+        if valueset == 6:
+            A, B, C, D = (67.0, 0.8, 500, 1)
+            saltcorr = 3
+        if valueset == 7:
+            A, B, C, D = (81.5, 0.41, 600, 1)
+            saltcorr = 2
+        if valueset == 8:
+            A, B, C, D = (77.1, 0.41, 528, 1)
+            saltcorr = 4
+    if valueset > 8:
+        raise ValueError("allowed values for parameter 'valueset' are 0-8.")
+
+    melting_temp = A + B * percent_gc - C / (len(seq) * 1.0)
+    if saltcorr:
+        melting_temp += salt_correction(
+            Na=Na, K=K, Tris=Tris, Mg=Mg, dNTPs=dNTPs, seq=seq, method=saltcorr
+        )
+    if mismatch:
+        melting_temp -= D * (seq.count("X") * 100.0 / len(seq))
+    return melting_temp
+
+
+def _key_error(neighbors, strict):
+    """Throw an error or a warning if there is no data for the neighbors (PRIVATE)."""
+    # We haven't found the key in the tables
+    if strict:
+        raise ValueError("no thermodynamic data for neighbors %r available" % neighbors)
+    else:
+        warnings.warn(
+            "no thermodynamic data for neighbors %r available. "
+            "Calculation will be wrong" % neighbors,
+            BiopythonWarning,
+        )
+
+
+def Tm_NN(
+    seq,
+    check=True,
+    strict=True,
+    c_seq=None,
+    shift=0,
+    nn_table=None,
+    tmm_table=None,
+    imm_table=None,
+    de_table=None,
+    dnac1=25,
+    dnac2=25,
+    selfcomp=False,
+    Na=50,
+    K=0,
+    Tris=0,
+    Mg=0,
+    dNTPs=0,
+    saltcorr=5,
+):
+    """Return the Tm using nearest neighbor thermodynamics.
+
+    Arguments:
+     - seq: The primer/probe sequence as string or Biopython sequence object.
+       For RNA/DNA hybridizations seq must be the RNA sequence.
+     - c_seq: Complementary sequence. The sequence of the template/target in
+       3'->5' direction. c_seq is necessary for mismatch correction and
+       dangling-ends correction. Both corrections will automatically be
+       applied if mismatches or dangling ends are present. Default=None.
+     - shift: Shift of the primer/probe sequence on the template/target
+       sequence, e.g.::
+
+                           shift=0       shift=1        shift= -1
+        Primer (seq):      5' ATGC...    5'  ATGC...    5' ATGC...
+        Template (c_seq):  3' TACG...    3' CTACG...    3'  ACG...
+
+       The shift parameter is necessary to align seq and c_seq if they have
+       different lengths or if they should have dangling ends. Default=0
+     - nn_table: Thermodynamic NN values, eight tables are implemented:
+       For DNA/DNA hybridizations:
+
+        - DNA_NN1: values from Breslauer et al. (1986)
+        - DNA_NN2: values from Sugimoto et al. (1996)
+        - DNA_NN3: values from Allawi & SantaLucia (1997) (default)
+        - DNA_NN4: values from SantaLucia & Hicks (2004)
+
+       For RNA/RNA hybridizations:
+
+        - RNA_NN1: values from Freier et al. (1986)
+        - RNA_NN2: values from Xia et al. (1998)
+        - RNA_NN3: values from Chen et al. (2012)
+
+       For RNA/DNA hybridizations:
+
+        - R_DNA_NN1: values from Sugimoto et al. (1995)
+          Note that ``seq`` must be the RNA sequence.
+
+       Use the module's make_table function to make a new table or to update
+       one of the implemented tables.
+     - tmm_table: Thermodynamic values for terminal mismatches.
+       Default: DNA_TMM1 (SantaLucia & Peyret, 2001)
+     - imm_table: Thermodynamic values for internal mismatches, may include
+       inosine mismatches. Default: DNA_IMM1 (Allawi & SantaLucia, 1997-1998;
+       Peyret et al., 1999; Watkins & SantaLucia, 2005)
+     - de_table: Thermodynamic values for dangling ends:
+
+        - DNA_DE1: for DNA. Values from Bommarito et al. (2000) (default)
+        - RNA_DE1: for RNA. Values from Turner & Mathews (2010)
+
+     - dnac1: Concentration of the higher concentrated strand [nM]. Typically
+       this will be the primer (for PCR) or the probe. Default=25.
+     - dnac2: Concentration of the lower concentrated strand [nM]. In PCR this
+       is the template strand, whose concentration is typically very low and
+       may be ignored (dnac2=0). In oligo/oligo hybridization experiments,
+       dnac1 equals dnac2. Default=25.
+       MELTING and Primer3Plus use k = [Oligo(Total)]/4 by default. To mimic
+       this behaviour, you have to divide [Oligo(Total)] by 2 and assign this
+       concentration to dnac1 and dnac2. E.g., Total oligo concentration of
+       50 nM in Primer3Plus means dnac1=25, dnac2=25.
+     - selfcomp: Is the sequence self-complementary? Default=False. If 'True',
+       the primer is assumed to bind to itself, thus dnac2 is not considered.
+     - Na, K, Tris, Mg, dNTPs: See method 'Tm_GC' for details. Defaults: Na=50,
+       K=0, Tris=0, Mg=0, dNTPs=0.
+     - saltcorr: See method 'Tm_GC'. Default=5. 0 means no salt correction.
+
+    """
+    # Set defaults
+    if not nn_table:
+        nn_table = DNA_NN3
+    if not tmm_table:
+        tmm_table = DNA_TMM1
+    if not imm_table:
+        imm_table = DNA_IMM1
+    if not de_table:
+        de_table = DNA_DE1
+
+    seq = str(seq)
+    if not c_seq:
+        # c_seq must be provided by user if dangling ends or mismatches should
+        # be taken into account. Otherwise take perfect complement.
+        c_seq = Seq.Seq(seq).complement()
+    c_seq = str(c_seq)
+    if check:
+        seq = _check(seq, "Tm_NN")
+        c_seq = _check(c_seq, "Tm_NN")
+    tmp_seq = seq
+    tmp_cseq = c_seq
+    delta_h = 0
+    delta_s = 0
+    d_h = 0  # Names for indexes
+    d_s = 1  # 0 and 1
+
+    # Dangling ends?
+    if shift or len(seq) != len(c_seq):
+        # Align both sequences using the shift parameter
+        if shift > 0:
+            tmp_seq = "." * shift + seq
+        if shift < 0:
+            tmp_cseq = "." * abs(shift) + c_seq
+        if len(tmp_cseq) > len(tmp_seq):
+            tmp_seq += (len(tmp_cseq) - len(tmp_seq)) * "."
+        if len(tmp_cseq) < len(tmp_seq):
+            tmp_cseq += (len(tmp_seq) - len(tmp_cseq)) * "."
+        # Remove 'over-dangling' ends
+        while tmp_seq.startswith("..") or tmp_cseq.startswith(".."):
+            tmp_seq = tmp_seq[1:]
+            tmp_cseq = tmp_cseq[1:]
+        while tmp_seq.endswith("..") or tmp_cseq.endswith(".."):
+            tmp_seq = tmp_seq[:-1]
+            tmp_cseq = tmp_cseq[:-1]
+        # Now for the dangling ends
+        if tmp_seq.startswith(".") or tmp_cseq.startswith("."):
+            left_de = tmp_seq[:2] + "/" + tmp_cseq[:2]
+            try:
+                delta_h += de_table[left_de][d_h]
+                delta_s += de_table[left_de][d_s]
+            except KeyError:
+                _key_error(left_de, strict)
+            tmp_seq = tmp_seq[1:]
+            tmp_cseq = tmp_cseq[1:]
+        if tmp_seq.endswith(".") or tmp_cseq.endswith("."):
+            right_de = tmp_cseq[-2:][::-1] + "/" + tmp_seq[-2:][::-1]
+            try:
+                delta_h += de_table[right_de][d_h]
+                delta_s += de_table[right_de][d_s]
+            except KeyError:
+                _key_error(right_de, strict)
+            tmp_seq = tmp_seq[:-1]
+            tmp_cseq = tmp_cseq[:-1]
+
+    # Now for terminal mismatches
+    left_tmm = tmp_cseq[:2][::-1] + "/" + tmp_seq[:2][::-1]
+    if left_tmm in tmm_table:
+        delta_h += tmm_table[left_tmm][d_h]
+        delta_s += tmm_table[left_tmm][d_s]
+        tmp_seq = tmp_seq[1:]
+        tmp_cseq = tmp_cseq[1:]
+    right_tmm = tmp_seq[-2:] + "/" + tmp_cseq[-2:]
+    if right_tmm in tmm_table:
+        delta_h += tmm_table[right_tmm][d_h]
+        delta_s += tmm_table[right_tmm][d_s]
+        tmp_seq = tmp_seq[:-1]
+        tmp_cseq = tmp_cseq[:-1]
+
+    # Now everything 'unusual' at the ends is handled and removed and we can
+    # look at the initiation.
+    # One or several of the following initiation types may apply:
+
+    # Type: General initiation value
+    delta_h += nn_table["init"][d_h]
+    delta_s += nn_table["init"][d_s]
+
+    # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
+    if SeqUtils.GC(seq) == 0:
+        delta_h += nn_table["init_allA/T"][d_h]
+        delta_s += nn_table["init_allA/T"][d_s]
+    else:
+        delta_h += nn_table["init_oneG/C"][d_h]
+        delta_s += nn_table["init_oneG/C"][d_s]
+
+    # Type: Penalty if 5' end is T
+    if seq.startswith("T"):
+        delta_h += nn_table["init_5T/A"][d_h]
+        delta_s += nn_table["init_5T/A"][d_s]
+    if seq.endswith("A"):
+        delta_h += nn_table["init_5T/A"][d_h]
+        delta_s += nn_table["init_5T/A"][d_s]
+
+    # Type: Different values for G/C or A/T terminal basepairs
+    ends = seq[0] + seq[-1]
+    AT = ends.count("A") + ends.count("T")
+    GC = ends.count("G") + ends.count("C")
+    delta_h += nn_table["init_A/T"][d_h] * AT
+    delta_s += nn_table["init_A/T"][d_s] * AT
+    delta_h += nn_table["init_G/C"][d_h] * GC
+    delta_s += nn_table["init_G/C"][d_s] * GC
+
+    # Finally, the 'zipping'
+    for basenumber in range(len(tmp_seq) - 1):
+        neighbors = (
+            tmp_seq[basenumber : basenumber + 2]
+            + "/"
+            + tmp_cseq[basenumber : basenumber + 2]
+        )
+        if neighbors in imm_table:
+            delta_h += imm_table[neighbors][d_h]
+            delta_s += imm_table[neighbors][d_s]
+        elif neighbors[::-1] in imm_table:
+            delta_h += imm_table[neighbors[::-1]][d_h]
+            delta_s += imm_table[neighbors[::-1]][d_s]
+        elif neighbors in nn_table:
+            delta_h += nn_table[neighbors][d_h]
+            delta_s += nn_table[neighbors][d_s]
+        elif neighbors[::-1] in nn_table:
+            delta_h += nn_table[neighbors[::-1]][d_h]
+            delta_s += nn_table[neighbors[::-1]][d_s]
+        else:
+            # We haven't found the key...
+            _key_error(neighbors, strict)
+
+    k = (dnac1 - (dnac2 / 2.0)) * 1e-9
+    if selfcomp:
+        k = dnac1 * 1e-9
+        delta_h += nn_table["sym"][d_h]
+        delta_s += nn_table["sym"][d_s]
+    R = 1.987  # universal gas constant in cal/(K * mol)
+    if saltcorr:
+        corr = salt_correction(
+            Na=Na, K=K, Tris=Tris, Mg=Mg, dNTPs=dNTPs, method=saltcorr, seq=seq
+        )
+    if saltcorr == 5:
+        delta_s += corr
+    melting_temp = (1000 * delta_h) / (delta_s + (R * (math.log(k)))) - 273.15
+    if saltcorr in (1, 2, 3, 4):
+        melting_temp += corr
+    if saltcorr in (6, 7):
+        # Tm = 1/(1/Tm + corr)
+        melting_temp = 1 / (1 / (melting_temp + 273.15) + corr) - 273.15
+
+    return melting_temp
+
+
+def Tm_staluc(s, dnac=50, saltc=50, rna=0):
+    """Return DNA/DNA Tm using nearest neighbor thermodynamics (OBSOLETE).
+
+    This method may be deprecated in the future. Use Tm_NN instead. Tm_NN
+    with default values gives the same result as Tm_staluc.
+
+    s is the sequence as string or Seq object
+    dnac is DNA concentration [nM]
+    saltc is salt concentration [mM].
+    rna=0 is for DNA/DNA (default), use 1 for RNA/RNA hybridisation.
+
+    For DNA/DNA, see Allawi & SantaLucia (1997), Biochemistry 36: 10581-10594
+    For RNA/RNA, see Xia et al (1998), Biochemistry 37: 14719-14735
+
+    Examples
+    --------
+    >>> print("%0.2f" % Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA'))
+    59.87
+    >>> print("%0.2f" % Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA', rna=True))
+    77.90
+
+    You can also use a Seq object instead of a string,
+
+    >>> from Bio.Seq import Seq
+    >>> s = Seq('CAGTCAGTACGTACGTGTACTGCCGTA')
+    >>> print("%0.2f" % Tm_staluc(s))
+    59.87
+    >>> print("%0.2f" % Tm_staluc(s, rna=True))
+    77.90
+
+    """
+    # The original method was by Sebastian Bassi. It is now superseded by
+    # Tm_NN.
+
+    warnings.warn(
+        "Tm_staluc is deprecated; please use Tm_NN instead.",
+        BiopythonDeprecationWarning,
+    )
+    if not rna:
+        return Tm_NN(s, dnac1=dnac / 2.0, dnac2=dnac / 2.0, Na=saltc)
+    elif rna == 1:
+        return Tm_NN(s, dnac1=dnac / 2.0, dnac2=dnac / 2.0, Na=saltc, nn_table=RNA_NN2)
+    else:
+        raise ValueError(f"rna={rna} not supported")
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqUtils/ProtParam.py b/code/lib/Bio/SeqUtils/ProtParam.py
new file mode 100644
index 0000000..937e3c5
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/ProtParam.py
@@ -0,0 +1,356 @@
+# Copyright 2003 Yair Benita.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Simple protein analysis.
+
+Examples
+--------
+>>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+>>> X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGT"
+...                     "RDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEEC"
+...                     "LFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILF"
+...                     "LPLPV")
+>>> print(X.count_amino_acids()['A'])
+6
+>>> print(X.count_amino_acids()['E'])
+12
+>>> print("%0.2f" % X.get_amino_acids_percent()['A'])
+0.04
+>>> print("%0.2f" % X.get_amino_acids_percent()['L'])
+0.12
+>>> print("%0.2f" % X.molecular_weight())
+17103.16
+>>> print("%0.2f" % X.aromaticity())
+0.10
+>>> print("%0.2f" % X.instability_index())
+41.98
+>>> print("%0.2f" % X.isoelectric_point())
+7.72
+>>> sec_struc = X.secondary_structure_fraction()  # [helix, turn, sheet]
+>>> print("%0.2f" % sec_struc[0])  # helix
+0.28
+>>> epsilon_prot = X.molar_extinction_coefficient()  # [reduced, oxidized]
+>>> print(epsilon_prot[0])  # with reduced cysteines
+17420
+>>> print(epsilon_prot[1])  # with disulfid bridges
+17545
+
+Other public methods are:
+ - gravy
+ - protein_scale
+ - flexibility
+ - charge_at_pH
+
+"""
+
+
+import sys
+from Bio.SeqUtils import ProtParamData  # Local
+from Bio.SeqUtils import IsoelectricPoint  # Local
+from Bio.Seq import Seq
+from Bio.Data import IUPACData
+from Bio.SeqUtils import molecular_weight
+
+
+class ProteinAnalysis:
+    """Class containing methods for protein analysis.
+
+    The constructor takes two arguments.
+    The first is the protein sequence as a string, which is then converted to a
+    sequence object using the Bio.Seq module. This is done just to make sure
+    the sequence is a protein sequence and not anything else.
+
+    The second argument is optional. If set to True, the weight of the amino
+    acids will be calculated using their monoisotopic mass (the weight of the
+    most abundant isotopes for each element), instead of the average molecular
+    mass (the averaged weight of all stable isotopes for each element).
+    If set to False (the default value) or left out, the IUPAC average
+    molecular mass will be used for the calculation.
+
+    """
+
+    def __init__(self, prot_sequence, monoisotopic=False):
+        """Initialize the class."""
+        if prot_sequence.islower():
+            self.sequence = Seq(prot_sequence.upper())
+        else:
+            self.sequence = Seq(prot_sequence)
+        self.amino_acids_content = None
+        self.amino_acids_percent = None
+        self.length = len(self.sequence)
+        self.monoisotopic = monoisotopic
+
+    def count_amino_acids(self):
+        """Count standard amino acids, return a dict.
+
+        Counts the number of times each amino acid is in the protein
+        sequence. Returns a dictionary {AminoAcid:Number}.
+
+        The return value is cached in self.amino_acids_content.
+        It is not recalculated upon subsequent calls.
+        """
+        if self.amino_acids_content is None:
+            prot_dic = {k: 0 for k in IUPACData.protein_letters}
+            for aa in prot_dic:
+                prot_dic[aa] = self.sequence.count(aa)
+
+            self.amino_acids_content = prot_dic
+
+        return self.amino_acids_content
+
+    def get_amino_acids_percent(self):
+        """Calculate the amino acid content in percentages.
+
+        The same as count_amino_acids, but expresses each count as a fraction
+        of the entire sequence length. Returns a dictionary of
+        {AminoAcid:percentage}.
+
+        The return value is cached in self.amino_acids_percent.
+
+        input is the dictionary self.amino_acids_content.
+        output is a dictionary with amino acids as keys.
+        """
+        if self.amino_acids_percent is None:
+            aa_counts = self.count_amino_acids()
+
+            percentages = {}
+            for aa in aa_counts:
+                percentages[aa] = aa_counts[aa] / float(self.length)
+
+            self.amino_acids_percent = percentages
+
+        return self.amino_acids_percent
+
+    def molecular_weight(self):
+        """Calculate MW from Protein sequence."""
+        return molecular_weight(
+            self.sequence, seq_type="protein", monoisotopic=self.monoisotopic
+        )
+
+    def aromaticity(self):
+        """Calculate the aromaticity according to Lobry, 1994.
+
+        Calculates the aromaticity value of a protein according to Lobry, 1994.
+        It is simply the relative frequency of Phe+Trp+Tyr.
+        """
+        aromatic_aas = "YWF"
+        aa_percentages = self.get_amino_acids_percent()
+
+        aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas)
+
+        return aromaticity
+
+    def instability_index(self):
+        """Calculate the instability index according to Guruprasad et al 1990.
+
+        Implementation of the method of Guruprasad et al. 1990 to test a
+        protein for stability. Any value above 40 means the protein is
+        unstable (has a short half-life).
+
+        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
+        Protein Engineering 4:155-161(1990).
+        """
+        index = ProtParamData.DIWV
+        score = 0.0
+
+        for i in range(self.length - 1):
+            this, nxt = self.sequence[i : i + 2]
+            dipeptide_value = index[this][nxt]
+            score += dipeptide_value
+
+        return (10.0 / self.length) * score
+
+    def flexibility(self):
+        """Calculate the flexibility according to Vihinen, 1994.
+
+        No argument to change window size because parameters are specific for
+        a window=9. The parameters used are optimized for determining the
+        flexibility.
+        """
+        flexibilities = ProtParamData.Flex
+        window_size = 9
+        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
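+        # Note (added): only the first four weights are applied, once to each
+        # window half; together with the centre residue's implicit weight of 1
+        # the total is 2 * (0.25 + 0.4375 + 0.625 + 0.8125) + 1 = 5.25, the
+        # divisor used below.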
+        scores = []
+
+        for i in range(self.length - window_size):
+            subsequence = self.sequence[i : i + window_size]
+            score = 0.0
+
+            for j in range(window_size // 2):
+                front = subsequence[j]
+                back = subsequence[window_size - j - 1]
+                score += (flexibilities[front] + flexibilities[back]) * weights[j]
+
+            middle = subsequence[window_size // 2 + 1]
+            score += flexibilities[middle]
+
+            scores.append(score / 5.25)
+
+        return scores
+
+    def gravy(self, scale="KyteDoolitle"):
+        """Calculate the GRAVY (Grand Average of Hydropathy) according to Kyte and Doolitle, 1982.
+
+        Utilizes the given Hydrophobicity scale, by default uses the original
+        proposed by Kyte and Doolittle (KyteDoolitle). Other options are:
+        Aboderin, AbrahamLeo, Argos, BlackMould, BullBreese, Casari, Cid,
+        Cowan3.4, Cowan7.5, Eisenberg, Engelman, Fasman, Fauchere, GoldSack,
+        Guy, Jones, Juretic, Kidera, Miyazawa, Parker, Ponnuswamy, Rose,
+        Roseman, Sweet, Tanford, Wilson and Zimmerman.
+
+        New scales can be added in ProtParamData.
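+
+        For example, with the default Kyte-Doolittle scale (A=1.8, G=-0.4):
+
+        >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+        >>> round(ProteinAnalysis("AG").gravy(), 2)
+        0.7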
+        """
+        selected_scale = ProtParamData.gravy_scales.get(scale, -1)
+
+        if selected_scale == -1:
+            raise ValueError(f"scale: {scale} not known")
+
+        total_gravy = sum(selected_scale[aa] for aa in self.sequence)
+
+        return total_gravy / self.length
+
+    def _weight_list(self, window, edge):
+        """Make list of relative weight of window edges (PRIVATE).
+
+        The relative weight of window edges are compared to the window
+        center. The weights are linear. It actually generates half a list.
+        For a window of size 9 and edge 0.4 you get a list of
+        [0.4, 0.55, 0.7, 0.85].
+        """
+        unit = 2 * (1.0 - edge) / (window - 1)
+        weights = [0.0] * (window // 2)
+
+        for i in range(window // 2):
+            weights[i] = edge + unit * i
+
+        return weights
+
+    def protein_scale(self, param_dict, window, edge=1.0):
+        """Compute a profile by any amino acid scale.
+
+        An amino acid scale is defined by a numerical value assigned to each
+        type of amino acid. The most frequently used scales are the
+        hydrophobicity or hydrophilicity scales and the secondary structure
+        conformational parameters scales, but many other scales exist which
+        are based on different chemical and physical properties of the
+        amino acids.  You can set several parameters that control the
+        computation of a scale profile, such as the window size and the window
+        edge relative weight value.
+
+        WindowSize: The window size is the length of the interval to use for
+        the profile computation. For a window size n, we use the i-(n-1)/2
+        neighboring residues on each side to compute the score for residue i.
+        The score for residue i is the sum of the scaled values for these
+        amino acids, optionally weighted according to their position in the
+        window.
+
+        Edge: The central amino acid of the window always has a weight of 1.
+        By default, the amino acids at the remaining window positions have the
+        same weight, but you can make the residue at the center of the window
+        have a larger weight than the others by setting the edge value for the
+        residues at the beginning and end of the interval to a value between
+        0 and 1. For instance, for Edge=0.4 and a window size of 5 the weights
+        will be: 0.4, 0.7, 1.0, 0.7, 0.4.
+
+        The method returns a list of values which can be plotted to view the
+        change along a protein sequence.  Many scales exist. Just add your
+        favorites to the ProtParamData modules.
+
+        Similar to ExPASy's ProtScale:
+        http://www.expasy.org/cgi-bin/protscale.pl
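+
+        A minimal usage sketch (illustrative sequence; the resulting profile
+        has length - window + 1 entries):
+
+        >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+        >>> from Bio.SeqUtils import ProtParamData
+        >>> pa = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKK")
+        >>> profile = pa.protein_scale(ProtParamData.kd, window=9, edge=0.4)
+        >>> len(profile) == len("MAEGEITTFTALTEKFNLPPGNYKK") - 9 + 1
+        True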
+        """
+        # generate the weights
+        #   _weight_list returns only one tail. If the list should be
+        #   [0.4,0.7,1.0,0.7,0.4] what you actually get from _weights_list
+        #   is [0.4,0.7]. The correct calculation is done in the loop.
+        weights = self._weight_list(window, edge)
+        scores = []
+
+        # the score in each Window is divided by the sum of weights
+        # (* 2 + 1) since the weight list is one sided:
+        sum_of_weights = sum(weights) * 2 + 1
+
+        for i in range(self.length - window + 1):
+            subsequence = self.sequence[i : i + window]
+            score = 0.0
+
+            for j in range(window // 2):
+                # walk from the outside of the Window towards the middle.
+                # Iddo: try/except clauses added to avoid raising an exception
+                # on a non-standard amino acid
+                try:
+                    front = param_dict[subsequence[j]]
+                    back = param_dict[subsequence[window - j - 1]]
+                    score += weights[j] * front + weights[j] * back
+                except KeyError:
+                    sys.stderr.write(
+                        "warning: %s or %s is not a standard "
+                        "amino acid.\n" % (subsequence[j], subsequence[window - j - 1])
+                    )
+
+            # Now add the middle value, which always has a weight of 1.
+            middle = subsequence[window // 2]
+            if middle in param_dict:
+                score += param_dict[middle]
+            else:
+                sys.stderr.write(
+                    "warning: %s  is not a standard amino acid.\n" % middle
+                )
+
+            scores.append(score / sum_of_weights)
+
+        return scores
+
+    def isoelectric_point(self):
+        """Calculate the isoelectric point.
+
+        Uses the module IsoelectricPoint to calculate the pI of a protein.
+        """
+        aa_content = self.count_amino_acids()
+
+        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
+        return ie_point.pi()
+
+    def charge_at_pH(self, pH):
+        """Calculate the charge of a protein at given pH."""
+        aa_content = self.count_amino_acids()
+        charge = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
+        return charge.charge_at_pH(pH)
+
+    def secondary_structure_fraction(self):
+        """Calculate fraction of helix, turn and sheet.
+
+        Returns a list of the fraction of amino acids which tend
+        to be in Helix, Turn or Sheet.
+
+        Amino acids in helix: V, I, Y, F, W, L.
+        Amino acids in Turn: N, P, G, S.
+        Amino acids in sheet: E, M, A, L.
+
+        Returns a tuple of three floats (Helix, Turn, Sheet).
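+
+        For example (V counts toward helix, N toward turn, and neither
+        toward sheet):
+
+        >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+        >>> ProteinAnalysis("VN").secondary_structure_fraction()
+        (0.5, 0.5, 0.0)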
+        """
+        aa_percentages = self.get_amino_acids_percent()
+
+        helix = sum(aa_percentages[r] for r in "VIYFWL")
+        turn = sum(aa_percentages[r] for r in "NPGS")
+        sheet = sum(aa_percentages[r] for r in "EMAL")
+
+        return helix, turn, sheet
+
+    def molar_extinction_coefficient(self):
+        """Calculate the molar extinction coefficient.
+
+        Calculates the molar extinction coefficient assuming cysteines
+        (reduced) and cystine residues (Cys-Cys-bond).
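+
+        For example, two Trp and two Cys (one potential cystine bond):
+
+        >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+        >>> ProteinAnalysis("WWCC").molar_extinction_coefficient()
+        (11000, 11125)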
+        """
+        num_aa = self.count_amino_acids()
+        mec_reduced = num_aa["W"] * 5500 + num_aa["Y"] * 1490
+        mec_cystines = mec_reduced + (num_aa["C"] // 2) * 125
+        return (mec_reduced, mec_cystines)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqUtils/ProtParamData.py b/code/lib/Bio/SeqUtils/ProtParamData.py
new file mode 100644
index 0000000..59f9ce8
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/ProtParamData.py
@@ -0,0 +1,390 @@
+# Copyright 2003 Yair Benita.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Indices to be used with ProtParam."""
+
+# Turn black code style off
+# fmt: off
+
+# Hydrophobicity
+
+# Kyte & Doolittle index of hydrophobicity
+# J. Mol. Biol. 157:105-132(1982).
+# "KyteDoolittle"
+kd = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
+      "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
+      "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
+      "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2}
+
+# Aboderin hydrophobicity index
+# International J. of Biochemistry, 2(11), 537-544.
+# "Aboderin"
+ab = {"A": 5.1, "R": 2.0, "N": 0.6, "D": 0.7, "C": 0.0,
+      "Q": 1.4, "E": 1.8, "G": 4.1, "H": 1.6, "I": 9.3,
+      "L": 10.0, "K": 1.3, "M": 8.7, "F": 9.6, "P": 4.9,
+      "S": 3.1, "T": 3.5, "W": 9.2, "Y": 8.0, "V": 8.5}
+
+# Abraham & Leo hydrophobicity index
+# Proteins: Structure, Function and Genetics 2:130-152(1987).
+# "AbrahamLeo"
+al = {"A": 0.44, "R": -2.42, "N": -1.32, "D": -0.31, "C": 0.58,
+      "Q": -0.71, "E": -0.34, "G": 0.0, "H": -0.01, "I": 2.46,
+      "L": 2.46, "K": -2.45, "M": 1.1, "F": 2.54, "P": 1.29,
+      "S": -0.84, "T": -0.41, "W": 2.56, "Y": 1.63, "V": 1.73}
+
+# Argos hydrophobicity index
+# European Journal of Biochemistry, 128(2-3), 565-575.
+# "Argos"
+ag = {"A": 0.61, "R": 0.6, "N": 0.06, "D": 0.46, "C": 1.07,
+      "Q": 0.0, "E": 0.47, "G": 0.07, "H": 0.61, "I": 2.22,
+      "L": 1.53, "K": 1.15, "M": 1.18, "F": 2.02, "P": 1.95,
+      "S": 0.05, "T": 0.05, "W": 2.65, "Y": 1.88, "V": 1.32}
+
+# Black & Mould hydrophobicity index
+# Anal. Biochem. 193:72-82(1991).
+# "BlackMould"
+bm = {"A": 0.616, "R": 0.0, "N": 0.236, "D": 0.028, "C": 0.68,
+      "Q": 0.251, "E": 0.043, "G": 0.501, "H": 0.165, "I": 0.943,
+      "L": 0.943, "K": 0.283, "M": 0.738, "F": 1.0, "P": 0.711,
+      "S": 0.359, "T": 0.45, "W": 0.878, "Y": 0.88, "V": 0.825}
+
+# Bull & Breese hydrophobicity index
+# Arch. Biochem. Biophys. 161:665-670(1974)
+# "BullBreese"
+bb = {"A": 0.61, "R": 0.69, "N": 0.89, "D": 0.61, "C": 0.36,
+      "Q": 0.97, "E": 0.51, "G": 0.81, "H": 0.69, "I": -1.45,
+      "L": -1.65, "K": 0.46, "M": -0.66, "F": -1.52, "P": -0.17,
+      "S": 0.42, "T": 0.29, "W": -1.2, "Y": -1.43, "V": -0.75}
+
+# Casari & Sippl hydrophobic potential
+# Journal of molecular biology, 224(3), 725-732.
+# "Casari"
+cs = {"A": 0.2, "R": -0.7, "N": -0.5, "D": -1.4, "C": 1.9,
+      "Q": -1.1, "E": -1.3, "G": -0.1, "H": 0.4, "I": 1.4,
+      "L": 0.5, "K": -1.6, "M": 0.5, "F": 1.0, "P": -1.0,
+      "S": -0.7, "T": -0.4, "W": 1.6, "Y": 0.5, "V": 0.7}
+
+# Cid hydrophobicity index
+# Protein engineering, 5(5), 373-375.
+# "Cid"
+ci = {"A": 0.02, "R": -0.42, "N": -0.77, "D": -1.04, "C": 0.77,
+      "Q": -1.1, "E": -1.14, "G": -0.8, "H": 0.26, "I": 1.81,
+      "L": 1.14, "K": -0.41, "M": 1.0, "F": 1.35, "P": -0.09,
+      "S": -0.97, "T": -0.77, "W": 1.71, "Y": 1.11, "V": 1.13}
+
+# Cowan hydrophobicity indices at ph 3.4 and 7.5
+# Peptide Research 3:75-80(1990).
+# "Cowan3.4" "Conwan7.5"
+cw = {3.4 : {"A": 0.42, "R": -1.56, "N": -1.03, "D": -0.51, "C": 0.84,
+             "Q": -0.96, "E": -0.37, "G": 0.0, "H": -2.28, "I": 1.81,
+             "L": 1.8, "K": -2.03, "M": 1.18, "F": 1.74, "P": 0.86,
+             "S": -0.64, "T": -0.26, "W": 1.46, "Y": 0.51, "V": 1.34},
+      7.5 : {"A": 0.35, "R": -1.5, "N": -0.99, "D": -2.15, "C": 0.76,
+             "Q": -0.93, "E": -1.95, "G": 0.0, "H": -0.65, "I": 1.83,
+             "L": 1.8, "K": -1.54, "M": 1.1, "F": 1.69, "P": 0.84,
+             "S": -0.63, "T": -0.27, "W": 1.35, "Y": 0.39, "V": 1.32}
+      }
+
+# Eisenberg Normalized consensus hydrophobicity scale
+# J. Mol. Biol. 179:125-142(1984)
+# "Eisenberg"
+es = {"A": 0.62, "R": -2.53, "N": -0.78, "D": -0.9, "C": 0.29,
+      "Q": -0.85, "E": -0.74, "G": 0.48, "H": -0.4, "I": 1.38,
+      "L": 1.06, "K": -1.5, "M": 0.64, "F": 1.19, "P": 0.12,
+      "S": -0.18, "T": -0.05, "W": 0.81, "Y": 0.26, "V": 1.08}
+
+# Engelman Hydrophobic Transfer Free Energies
+# Annual review of biophysics and biophysical chemistry, 15(1), 321-353.
+# "Engelman"
+eg = {"A": -1.6, "R": 12.3, "N": 4.8, "D": 9.2, "C": -2,
+      "Q": 4.1, "E": 8.2, "G": -1, "H": 3, "I": -3.1,
+      "L": -2.8, "K": 8.8, "M": -3.4, "F": -3.7, "P": 0.2,
+      "S": -0.6, "T": -1.2, "W": -1.9, "Y": 0.7, "V": -2.6}
+
+# Fasman hydrophobicity index
+# (1989). Prediction of protein structure and the principles of protein conformation. Springer.
+# "Fasman"
+fs = {"A": -0.21, "R": 2.11, "N": 0.96, "D": 1.36, "C": -6.04,
+      "Q": 1.52, "E": 2.3, "G": 0, "H": -1.23, "I": -4.81,
+      "L": -4.68, "K": 3.88, "M": -3.66, "F": -4.65, "P": 0.75,
+      "S": 1.74, "T": 0.78, "W": -3.32, "Y": -1.01, "V": -3.5}
+
+# Fauchere Hydrophobicity scale
+# Eur. J. Med. Chem. 18:369-375(1983).
+# "Fauchere"
+fc = {"A": 0.31, "R": -1.01, "N": -0.6, "D": -0.77, "C": 1.54,
+      "Q": -0.22, "E": -0.64, "G": 0, "H": 0.13, "I": 1.8,
+      "L": 1.7, "K": -0.99, "M": 1.23, "F": 1.79, "P": 0.72,
+      "S": -0.04, "T": 0.26, "W": 2.25, "Y": 0.96, "V": 1.22}
+
+# Goldsack & Chalifoux Free Energy of Mixing of the Hydrophobic Side Chains
+# Journal of theoretical biology, 39(3), 645-651.
+# "Goldsack"
+gd = {"A": 0.75, "R": 0.75, "N": 0.69, "D": 0, "C": 1,
+      "Q": 0.59, "E": 0, "G": 0, "H": 0, "I": 2.95,
+      "L": 2.4, "K": 1.5, "M": 1.3, "F": 2.65, "P": 2.6,
+      "S": 0, "T": 0.45, "W": 3, "Y": 2.85, "V": 1.7}
+
+# Guy Hydrophobicity scale based on free energy of transfer (kcal/mole).
+# Biophys J. 47:61-70(1985)
+# "Guy"
+gy = {"A": 0.1, "R": 1.91, "N": 0.48, "D": 0.78, "C": -1.42,
+      "Q": 0.95, "E": 0.83, "G": 0.33, "H": -0.5, "I": -1.13,
+      "L": -1.18, "K": 1.4, "M": -1.59, "F": -2.12, "P": 0.73,
+      "S": 0.52, "T": 0.07, "W": -0.51, "Y": -0.21, "V": -1.27}
+
+# Jones Hydrophobicity scale
+# Journal of theoretical biology, 50(1), 167-183.
+# "Jones"
+jo = {"A": 0.87, "R": 0.85, "N": 0.09, "D": 0.66, "C": 1.52,
+      "Q": 0, "E": 0.67, "G": 0.1, "H": 0.87, "I": 3.15,
+      "L": 2.17, "K": 1.64, "M": 1.67, "F": 2.87, "P": 2.77,
+      "S": 0.07, "T": 0.07, "W": 3.77, "Y": 2.67, "V": 1.87}
+
+# Juretic Hydrophobicity scale
+# Theoretical and computational chemistry, 5, 405-445.
+# "Juretic"
+ju = {"A": 1.1, "R": -5.1, "N": -3.5, "D": -3.6, "C": 2.5,
+      "Q": -3.68, "E": -3.2, "G": -0.64, "H": -3.2, "I": 4.5,
+      "L": 3.8, "K": -4.11, "M": 1.9, "F": 2.8, "P": -1.9,
+      "S": -0.5, "T": -0.7, "W": -0.46, "Y": -1.3, "V": 4.2}
+
+# Kidera Hydrophobicity Factors
+# Journal of Protein Chemistry, 4(1), 23-55.
+# "Kidera"
+ki = {"A": -0.27, "R": 1.87, "N": 0.81, "D": 0.81, "C": -1.05,
+      "Q": 1.1, "E": 1.17, "G": -0.16, "H": 0.28, "I": -0.77,
+      "L": -1.1, "K": 1.7, "M": -0.73, "F": -1.43, "P": -0.75,
+      "S": 0.42, "T": 0.63, "W": -1.57, "Y": -0.56, "V": -0.4}
+
+# Miyazawa Hydrophobicity scale (contact energy derived from 3D data)
+# Macromolecules 18:534-552(1985)
+# "Miyazawa"
+mi = {"A": 5.33, "R": 4.18, "N": 3.71, "D": 3.59, "C": 7.93,
+      "Q": 3.87, "E": 3.65, "G": 4.48, "H": 5.1, "I": 8.83,
+      "L": 8.47, "K": 2.95, "M": 8.95, "F": 9.03, "P": 3.87,
+      "S": 4.09, "T": 4.49, "W": 7.66, "Y": 5.89, "V": 7.63}
+
+# Parker Hydrophilicity scale derived from HPLC peptide retention times
+# Biochemistry 25:5425-5431(1986)
+# "Parker"
+pa = {"A": 2.1, "R": 4.2, "N": 7, "D": 10, "C": 1.4,
+      "Q": 6, "E": 7.8, "G": 5.7, "H": 2.1, "I": -8,
+      "L": -9.2, "K": 5.7, "M": -4.2, "F": -9.2, "P": 2.1,
+      "S": 6.5, "T": 5.2, "W": -10, "Y": -1.9, "V": -3.7}
+
+# Ponnuswamy Hydrophobic characteristics of folded proteins
+# Progress in biophysics and molecular biology, 59(1), 57-103.
+# "Ponnuswamy"
+po = {"A": 0.85, "R": 0.2, "N": -0.48, "D": -1.1, "C": 2.1,
+      "Q": -0.42, "E": -0.79, "G": 0, "H": 0.22, "I": 3.14,
+      "L": 1.99, "K": -1.19, "M": 1.42, "F": 1.69, "P": -1.14,
+      "S": -0.52, "T": -0.08, "W": 1.76, "Y": 1.37, "V": 2.53}
+
+# Rose Hydrophobicity scale
+# Science 229:834-838(1985)
+# "Rose"
+ro = {"A": 0.74, "R": 0.64, "N": 0.63, "D": 0.62, "C": 0.91,
+      "Q": 0.62, "E": 0.62, "G": 0.72, "H": 0.78, "I": 0.88,
+      "L": 0.85, "K": 0.52, "M": 0.85, "F": 0.88, "P": 0.64,
+      "S": 0.66, "T": 0.7, "W": 0.85, "Y": 0.76, "V": 0.86}
+
+# Roseman Hydrophobicity scale
+# J. Mol. Biol. 200:513-522(1988)
+# "Roseman"
+rm = {"A": 0.39, "R": -3.95, "N": -1.91, "D": -3.81, "C": 0.25,
+      "Q": -1.3, "E": -2.91, "G": 0, "H": -0.64, "I": 1.82,
+      "L": 1.82, "K": -2.77, "M": 0.96, "F": 2.27, "P": 0.99,
+      "S": -1.24, "T": -1, "W": 2.13, "Y": 1.47, "V": 1.3}
+
+# Sweet Optimized Matching Hydrophobicity (OMH)
+# J. Mol. Biol. 171:479-488(1983).
+# "Sweet"
+sw = {"A": -0.4, "R": -0.59, "N": -0.92, "D": -1.31, "C": 0.17,
+      "Q": -0.91, "E": -1.22, "G": -0.67, "H": -0.64, "I": 1.25,
+      "L": 1.22, "K": -0.67, "M": 1.02, "F": 1.92, "P": -0.49,
+      "S": -0.55, "T": -0.28, "W": 0.5, "Y": 1.67, "V": 0.91}
+
+# Tanford Hydrophobicity scale
+# J. Am. Chem. Soc. 84:4240-4274(1962)
+# "Tanford"
+ta = {"A": 0.62, "R": -2.53, "N": -0.78, "D": -0.09, "C": 0.29,
+      "Q": -0.85, "E": -0.74, "G": 0.48, "H": -0.4, "I": 1.38,
+      "L": 1.53, "K": -1.5, "M": 0.64, "F": 1.19, "P": 0.12,
+      "S": -0.18, "T": -0.05, "W": 0.81, "Y": 0.26, "V": 1.8}
+
+# Wilson Hydrophobic constants derived from HPLC peptide retention times
+# Biochem. J. 199:31-41(1981)
+# "Wilson"
+wi = {"A": -0.3, "R": -1.1, "N": -0.2, "D": -1.4, "C": 6.3,
+      "Q": -0.2, "E": 0, "G": 1.2, "H": -1.3, "I": 4.3,
+      "L": 6.6, "K": -3.6, "M": 2.5, "F": 7.5, "P": 2.2,
+      "S": -0.6, "T": -2.2, "W": 7.9, "Y": 7.1, "V": 5.9}
+
+# Zimmerman Hydrophobicity scale
+# Journal of theoretical biology, 21(2), 170-201.
+# "Zimmerman"
+zi = {"A": 0.83, "R": 0.83, "N": 0.09, "D": 0.64, "C": 1.48,
+      "Q": 0, "E": 0.65, "G": 0.1, "H": 1.1, "I": 3.07,
+      "L": 2.52, "K": 1.6, "M": 1.4, "F": 2.75, "P": 2.7,
+      "S": 0.14, "T": 0.54, "W": 0.31, "Y": 2.97, "V": 1.79}
+
+gravy_scales = {"KyteDoolitle": kd, "Aboderin": ab,
+                "AbrahamLeo": al, "Argos": ag,
+                "BlackMould": bm, "BullBreese": bb,
+                "Casari": cs, "Cid": ci,
+                "Cowan3.4": cw[3.4], "Cowan7.5": cw[7.5],
+                "Eisenberg": es, "Engelman": eg,
+                "Fasman": fs, "Fauchere": fc,
+                "GoldSack": gd, "Guy": gy,
+                "Jones": jo, "Juretic": ju,
+                "Kidera": ki, "Miyazawa": mi,
+                "Parker": pa, "Ponnuswamy": po,
+                "Rose": ro, "Roseman": rm,
+                "Sweet": sw, "Tanford": ta,
+                "Wilson": wi, "Zimmerman": zi}
+
+
+# Flexibility
+# Normalized flexibility parameters (B-values), average
+# Vihinen M., Torkkila E., Riikonen P. Proteins. 19(2):141-9(1994).
+Flex = {"A": 0.984, "C": 0.906, "E": 1.094, "D": 1.068,
+        "G": 1.031, "F": 0.915, "I": 0.927, "H": 0.950,
+        "K": 1.102, "M": 0.952, "L": 0.935, "N": 1.048,
+        "Q": 1.037, "P": 1.049, "S": 1.046, "R": 1.008,
+        "T": 0.997, "W": 0.904, "V": 0.931, "Y": 0.929}
+
+# Hydrophilicity
+# 1 Hopp & Wood
+# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981).
+hw = {"A": -0.5, "R": 3.0, "N": 0.2, "D": 3.0, "C": -1.0,
+      "Q": 0.2, "E": 3.0, "G": 0.0, "H": -0.5, "I": -1.8,
+      "L": -1.8, "K": 3.0, "M": -1.3, "F": -2.5, "P": 0.0,
+      "S": 0.3, "T": -0.4, "W": -3.4, "Y": -2.3, "V": -1.5}
+
+# Surface accessibility
+# Vergoten G & Theophanides T, Biomolecular Structure and Dynamics,
+# pg.138 (1997).
+# 1 Emini Surface fractional probability
+em = {"A": 0.815, "R": 1.475, "N": 1.296, "D": 1.283, "C": 0.394,
+      "Q": 1.348, "E": 1.445, "G": 0.714, "H": 1.180, "I": 0.603,
+      "L": 0.603, "K": 1.545, "M": 0.714, "F": 0.695, "P": 1.236,
+      "S": 1.115, "T": 1.184, "W": 0.808, "Y": 1.089, "V": 0.606}
+
+# 2 Janin Interior to surface transfer energy scale
+ja = {"A": 0.28, "R": -1.14, "N": -0.55, "D": -0.52, "C": 0.97,
+      "Q": -0.69, "E": -1.01, "G": 0.43, "H": -0.31, "I": 0.60,
+      "L": 0.60, "K": -1.62, "M": 0.43, "F": 0.46, "P": -0.42,
+      "S": -0.19, "T": -0.32, "W": 0.29, "Y": -0.15, "V": 0.60}
+
+
+# A two dimensional dictionary for calculating the instability index.
+# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
+# It is based on dipeptide values; therefore, the value for the dipeptide DG
+# is DIWV['D']['G'].
+DIWV = {"A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
+              "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0},
+        "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.60,
+              "K": 1.0, "M": 33.60, "L": 20.26, "N": 1.0,
+              "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0,
+              "T": 33.60, "W": 24.68, "V": -6.54, "Y": 1.0},
+        "E": {"A": 1.0, "C": 44.94, "E": 33.60, "D": 20.26,
+              "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0,
+              "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0},
+        "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0,
+              "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54,
+              "T": -14.03, "W": 1.0, "V": 1.0, "Y": 1.0},
+        "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0,
+              "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0,
+              "K": -7.49, "M": 1.0, "L": 1.0, "N": -7.49,
+              "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0,
+              "T": -7.49, "W": 13.34, "V": 1.0, "Y": -7.49},
+        "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
+              "K": -14.03, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
+              "T": 1.0, "W": 1.0, "V": 1.0, "Y": 33.601},
+        "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34,
+              "K": -7.49, "M": 1.0, "L": 20.26, "N": 1.0,
+              "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0,
+              "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0},
+        "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0,
+              "K": 24.68, "M": 1.0, "L": 1.0, "N": 24.68,
+              "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0,
+              "T": -6.54, "W": -1.88, "V": 1.0, "Y": 44.94},
+        "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0,
+              "K": 1.0, "M": 33.60, "L": -7.49, "N": 1.0,
+              "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.60,
+              "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0},
+        "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28,
+              "K": 1.0, "M": -1.88, "L": 1.0, "N": 1.0,
+              "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54,
+              "T": -1.88, "W": 1.0, "V": 1.0, "Y": 24.68},
+        "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
+              "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 33.60, "P": 20.26, "S": 1.0, "R": 20.26,
+              "T": 1.0, "W": 24.68, "V": 1.0, "Y": 1.0},
+        "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0,
+              "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0,
+              "K": 24.68, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0,
+              "T": -7.49, "W": -9.37, "V": 1.0, "Y": 1.0},
+        "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26,
+              "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0,
+              "T": 1.0, "W": 1.0, "V": -6.54, "Y": -6.54},
+        "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54,
+              "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0,
+              "K": 1.0, "M": -6.54, "L": 1.0, "N": 1.0,
+              "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54,
+              "T": 1.0, "W": -1.88, "V": 20.26, "Y": 1.0},
+        "S": {"A": 1.0, "C": 33.60, "E": 20.26, "D": 1.0,
+              "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26,
+              "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0},
+        "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": 13.34,
+              "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28,
+              "T": 1.0, "W": 58.28, "V": 1.0, "Y": -6.54},
+        "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0,
+              "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0,
+              "K": 1.0, "M": 1.0, "L": 1.0, "N": -14.03,
+              "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0,
+              "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0},
+        "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0,
+              "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68,
+              "K": 1.0, "M": 24.68, "L": 13.34, "N": 13.34,
+              "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0,
+              "T": -14.03, "W": 1.0, "V": -7.49, "Y": 1.0},
+        "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03,
+              "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0,
+              "K": -1.88, "M": 1.0, "L": 1.0, "N": 1.0,
+              "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
+              "T": -7.49, "W": 1.0, "V": 1.0, "Y": -6.54},
+        "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68,
+              "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34,
+              "K": 1.0, "M": 44.94, "L": 1.0, "N": 1.0,
+              "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91,
+              "T": -7.49, "W": -9.37, "V": 1.0, "Y": 13.34},
+        }
+
+# Turn black code style on
+# fmt: on
diff --git a/code/lib/Bio/SeqUtils/__init__.py b/code/lib/Bio/SeqUtils/__init__.py
new file mode 100644
index 0000000..9ea58d8
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/__init__.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python
+# Copyright 2002 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
+# Revisions copyright 2014 by Markus Piotrowski.
+# Revisions copyright 2014-2016 by Peter Cock.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Miscellaneous functions for dealing with sequences."""
+
+
+import re
+from math import pi, sin, cos
+
+from Bio.Seq import Seq
+from Bio.Data import IUPACData
+
+
+######################################
+# DNA
+######################
+# {{{
+
+
+def GC(seq):
+    """Calculate G+C content, return percentage (as float between 0 and 100).
+
+    Copes with mixed case sequences, and with the ambiguous nucleotide S (G or C)
+    when counting the G and C content.  The percentage is calculated against
+    the full length, e.g.:
+
+    >>> from Bio.SeqUtils import GC
+    >>> GC("ACTGN")
+    40.0
+
+    Note that this will return zero for an empty sequence.
+    """
+    gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
+    try:
+        return gc * 100.0 / len(seq)
+    except ZeroDivisionError:
+        return 0.0
+
+
+def GC123(seq):
+    """Calculate G+C content: total, for first, second and third positions.
+
+    Returns a tuple of four floats (percentages between 0 and 100) for the
+    entire sequence, and the three codon positions.  e.g.
+
+    >>> from Bio.SeqUtils import GC123
+    >>> GC123("ACTGTN")
+    (40.0, 50.0, 50.0, 0.0)
+
+    Copes with mixed case sequences, but does NOT deal with ambiguous
+    nucleotides.
+    """
+    d = {}
+    for nt in ["A", "T", "G", "C"]:
+        d[nt] = [0, 0, 0]
+
+    for i in range(0, len(seq), 3):
+        codon = seq[i : i + 3]
+        if len(codon) < 3:
+            codon += "  "
+        for pos in range(0, 3):
+            for nt in ["A", "T", "G", "C"]:
+                if codon[pos] == nt or codon[pos] == nt.lower():
+                    d[nt][pos] += 1
+    gc = {}
+    gcall = 0
+    nall = 0
+    for i in range(0, 3):
+        try:
+            n = d["G"][i] + d["C"][i] + d["T"][i] + d["A"][i]
+            gc[i] = (d["G"][i] + d["C"][i]) * 100.0 / n
+        except ZeroDivisionError:  # n is zero, e.g. for an empty sequence
+            gc[i] = 0
+
+        gcall = gcall + d["G"][i] + d["C"][i]
+        nall = nall + n
+
+    gcall = 100.0 * gcall / nall
+    return gcall, gc[0], gc[1], gc[2]
+
+
+def GC_skew(seq, window=100):
+    """Calculate GC skew (G-C)/(G+C) for multiple windows along the sequence.
+
+    Returns a list of ratios (floats), controlled by the length of the sequence
+    and the size of the window.
+
+    Returns 0 for windows without any G/C by handling zero division errors.
+
+    Does NOT look at any ambiguous nucleotides.
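+
+    For example, with two windows of six nucleotides each:
+
+    >>> from Bio.SeqUtils import GC_skew
+    >>> GC_skew("GGGGGGCCCCCC", window=6)
+    [1.0, -1.0]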
+    """
+    # 8/19/03: Iddo: added lowercase
+    values = []
+    for i in range(0, len(seq), window):
+        s = seq[i : i + window]
+        g = s.count("G") + s.count("g")
+        c = s.count("C") + s.count("c")
+        try:
+            skew = (g - c) / float(g + c)
+        except ZeroDivisionError:
+            skew = 0.0
+        values.append(skew)
+    return values
+
+
+def xGC_skew(seq, window=1000, zoom=100, r=300, px=100, py=100):
+    """Calculate and plot normal and accumulated GC skew (GRAPHICS !!!)."""
+    import tkinter
+
+    yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL)
+    xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL)
+    canvas = tkinter.Canvas(
+        yscrollcommand=yscroll.set, xscrollcommand=xscroll.set, background="white"
+    )
+    win = canvas.winfo_toplevel()
+    win.geometry("700x700")
+
+    yscroll.config(command=canvas.yview)
+    xscroll.config(command=canvas.xview)
+    yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y)
+    xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X)
+    canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1)
+    canvas.update()
+
+    X0, Y0 = r + px, r + py
+    x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r
+
+    ty = Y0
+    canvas.create_text(X0, ty, text="%s...%s (%d nt)" % (seq[:7], seq[-7:], len(seq)))
+    ty += 20
+    canvas.create_text(X0, ty, text="GC %3.2f%%" % (GC(seq)))
+    ty += 20
+    canvas.create_text(X0, ty, text="GC Skew", fill="blue")
+    ty += 20
+    canvas.create_text(X0, ty, text="Accumulated GC Skew", fill="magenta")
+    ty += 20
+    canvas.create_oval(x1, y1, x2, y2)
+
+    acc = 0
+    start = 0
+    for gc in GC_skew(seq, window):
+        r1 = r
+        acc += gc
+        # GC skew
+        alpha = pi - (2 * pi * start) / len(seq)
+        r2 = r1 - gc * zoom
+        x1 = X0 + r1 * sin(alpha)
+        y1 = Y0 + r1 * cos(alpha)
+        x2 = X0 + r2 * sin(alpha)
+        y2 = Y0 + r2 * cos(alpha)
+        canvas.create_line(x1, y1, x2, y2, fill="blue")
+        # accumulated GC skew
+        r1 = r - 50
+        r2 = r1 - acc
+        x1 = X0 + r1 * sin(alpha)
+        y1 = Y0 + r1 * cos(alpha)
+        x2 = X0 + r2 * sin(alpha)
+        y2 = Y0 + r2 * cos(alpha)
+        canvas.create_line(x1, y1, x2, y2, fill="magenta")
+
+        canvas.update()
+        start += window
+
+    canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
+
+
+def nt_search(seq, subseq):
+    """Search for a DNA subseq in sequence, return list of [subseq, positions].
+
+    Uses ambiguous values (like N = A or T or C or G, R = A or G etc.),
+    and searches only on the forward strand.
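+
+    For example (the first list element is the regular expression pattern
+    built from the subsequence, followed by the 0-based match positions):
+
+    >>> from Bio.SeqUtils import nt_search
+    >>> nt_search("ACTGACTG", "AC")
+    ['AC', 0, 4]
+    >>> nt_search("ACGT", "R")
+    ['[AG]', 0, 2]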
+    """
+    pattern = ""
+    for nt in subseq:
+        value = IUPACData.ambiguous_dna_values[nt]
+        if len(value) == 1:
+            pattern += value
+        else:
+            pattern += "[%s]" % value
+
+    pos = -1
+    result = [pattern]
+    while True:
+        pos += 1
+        s = seq[pos:]
+        m = re.search(pattern, s)
+        if not m:
+            break
+        pos += int(m.start(0))
+        result.append(pos)
+    return result
+
+
+######################################
+# Protein
+######################
+
+
+def seq3(seq, custom_map=None, undef_code="Xaa"):
+    """Convert protein sequence from one-letter to three-letter code.
+
+    The single required input argument 'seq' should be a protein sequence using
+    single letter codes, either as a Python string or as a Seq or MutableSeq
+    object.
+
+    This function returns the amino acid sequence as a string using the three
+    letter amino acid codes. Output follows the IUPAC standard (including
+    ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
+    for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an
+    asterisk. Any unknown character (including possible gap characters),
+    is changed into 'Xaa' by default.
+
+    e.g.
+
+    >>> from Bio.SeqUtils import seq3
+    >>> seq3("MAIVMGRWKGAR*")
+    'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
+
+    You can set a custom translation of the codon termination code using the
+    dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g.
+
+    >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
+    'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'
+
+    You can also set a custom translation for non-amino acid characters, such
+    as '-', using the "undef_code" argument, e.g.
+
+    >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
+    'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'
+
+    If not given, "undef_code" defaults to "Xaa", e.g.
+
+    >>> seq3("MAIVMGRWKGA--R*")
+    'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'
+
+    This function was inspired by BioPerl's seq3.
+    """
+    if custom_map is None:
+        custom_map = {"*": "Ter"}
+    # not doing .update() on IUPACData dict with custom_map dict
+    # to preserve its initial state (may be imported in other modules)
+    threecode = dict(
+        list(IUPACData.protein_letters_1to3_extended.items()) + list(custom_map.items())
+    )
+    # We use a default of 'Xaa' for undefined letters
+    # Note this will map '-' to 'Xaa' which may be undesirable!
+    return "".join(threecode.get(aa, undef_code) for aa in seq)
+
+
+def seq1(seq, custom_map=None, undef_code="X"):
+    """Convert protein sequence from three-letter to one-letter code.
+
+    The single required input argument 'seq' should be a protein sequence
+    using three-letter codes, either as a Python string or as a Seq or
+    MutableSeq object.
+
+    This function returns the amino acid sequence as a string using the one
+    letter amino acid codes. Output follows the IUPAC standard (including
+    ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for
+    "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code.
+    Any unknown character (including possible gap characters), is changed
+    into '-' by default.
+
+    e.g.
+
+    >>> from Bio.SeqUtils import seq1
+    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer")
+    'MAIVMGRWKGAR*'
+
+    The input is case insensitive, e.g.
+
+    >>> from Bio.SeqUtils import seq1
+    >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer")
+    'MAIVMGRWKGAR*'
+
+    You can set a custom translation of the codon termination code using the
+    dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g.
+
+    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla***", custom_map={"***": "*"})
+    'MAIVMGRWKGA*'
+
+    You can also set a custom translation for non-amino acid characters, such
+    as '-', using the "undef_code" argument, e.g.
+
+    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?')
+    'MAIVMGRWKGA??R*'
+
+    If not given, "undef_code" defaults to "X", e.g.
+
+    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer")
+    'MAIVMGRWKGAXXR*'
+
+    """
+    if custom_map is None:
+        custom_map = {"Ter": "*"}
+    # reverse map of threecode
+    # upper() on all keys to enable caps-insensitive input seq handling
+    onecode = {k.upper(): v for k, v in IUPACData.protein_letters_3to1_extended.items()}
+    # add the given termination codon code and custom maps
+    onecode.update((k.upper(), v) for k, v in custom_map.items())
+    seqlist = [seq[3 * i : 3 * (i + 1)] for i in range(len(seq) // 3)]
+    return "".join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
+
+
+######################################
+# Mixed ???
+######################
+
+
+def molecular_weight(
+    seq, seq_type="DNA", double_stranded=False, circular=False, monoisotopic=False
+):
+    """Calculate the molecular mass of DNA, RNA or protein sequences as float.
+
+    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
+    have a 5' phosphate.
+
+    Arguments:
+     - seq: String or Biopython sequence object.
+     - seq_type: The default is to assume DNA; override this with a string
+       "DNA", "RNA", or "protein".
+     - double_stranded: Calculate the mass for the double stranded molecule?
+     - circular: Is the molecule circular (has no ends)?
+     - monoisotopic: Use the monoisotopic mass tables?
+
+    >>> print("%0.2f" % molecular_weight("AGC"))
+    949.61
+    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
+    949.61
+
+    However, it is better to be explicit - for example with strings:
+
+    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
+    949.61
+    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
+    997.61
+    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
+    249.29
+
+    """
+    # Rewritten by Markus Piotrowski, 2014
+
+    seq = "".join(str(seq).split()).upper()  # Do the minimum formatting
+
+    if seq_type == "DNA":
+        if monoisotopic:
+            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
+        else:
+            weight_table = IUPACData.unambiguous_dna_weights
+    elif seq_type == "RNA":
+        if monoisotopic:
+            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
+        else:
+            weight_table = IUPACData.unambiguous_rna_weights
+    elif seq_type == "protein":
+        if monoisotopic:
+            weight_table = IUPACData.monoisotopic_protein_weights
+        else:
+            weight_table = IUPACData.protein_weights
+    else:
+        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" % seq_type)
+
+    if monoisotopic:
+        water = 18.010565
+    else:
+        water = 18.0153
+
+    try:
+        weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
+        if circular:
+            weight -= water
+    except KeyError as e:
+        raise ValueError(
+            "%s is not a valid unambiguous letter for %s" % (e, seq_type)
+        ) from None
+
+    if seq_type in ("DNA", "RNA") and double_stranded:
+        seq = str(Seq(seq).complement())
+        weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
+        if circular:
+            weight -= water
+    elif seq_type == "protein" and double_stranded:
+        raise ValueError("double-stranded proteins await their discovery")
+
+    return weight
+
+
+def six_frame_translations(seq, genetic_code=1):
+    """Return pretty string showing the 6 frame translations and GC content.
+
+    Nice looking 6 frame translation with GC content - code from xbbtools
+    similar to DNA Striders six-frame translation
+
+    >>> from Bio.SeqUtils import six_frame_translations
+    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
+    GC_Frame: a:5 t:0 g:8 c:5 
+    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
+    
+    
+    1/1
+      G  H  C  N  G  P  L
+     W  P  L  *  W  A  A
+    M  A  I  V  M  G  R  *
+    auggccauuguaaugggccgcuga   54 %
+    uaccgguaacauuacccggcgacu
+    A  M  T  I  P  R  Q 
+     H  G  N  Y  H  A  A  S
+      P  W  Q  L  P  G  S
+    
+    
+
+    """  # noqa for pep8 W291 trailing whitespace
+    from Bio.Seq import reverse_complement, translate
+
+    anti = reverse_complement(seq)
+    comp = anti[::-1]
+    length = len(seq)
+    frames = {}
+    for i in range(0, 3):
+        fragment_length = 3 * ((length - i) // 3)
+        frames[i + 1] = translate(seq[i : i + fragment_length], genetic_code)
+        frames[-(i + 1)] = translate(anti[i : i + fragment_length], genetic_code)[::-1]
+
+    # create header
+    if length > 20:
+        short = "%s ... %s" % (seq[:10], seq[-10:])
+    else:
+        short = seq
+    header = "GC_Frame: "
+    for nt in ["a", "t", "g", "c"]:
+        header += "%s:%d " % (nt, seq.count(nt.upper()))
+
+    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (
+        short.lower(),
+        length,
+        GC(seq),
+    )
+    res = header
+
+    for i in range(0, length, 60):
+        subseq = seq[i : i + 60]
+        csubseq = comp[i : i + 60]
+        p = i // 3
+        res += "%d/%d\n" % (i + 1, i / 3 + 1)
+        res += "  " + "  ".join(frames[3][p : p + 20]) + "\n"
+        res += " " + "  ".join(frames[2][p : p + 20]) + "\n"
+        res += "  ".join(frames[1][p : p + 20]) + "\n"
+        # seq
+        res += subseq.lower() + "%5d %%\n" % int(GC(subseq))
+        res += csubseq.lower() + "\n"
+        # - frames
+        res += "  ".join(frames[-2][p : p + 20]) + " \n"
+        res += " " + "  ".join(frames[-1][p : p + 20]) + "\n"
+        res += "  " + "  ".join(frames[-3][p : p + 20]) + "\n\n"
+    return res
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/SeqUtils/__pycache__/CheckSum.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/CheckSum.cpython-37.pyc
new file mode 100644
index 0000000..40884cc
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/CheckSum.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/CodonUsage.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/CodonUsage.cpython-37.pyc
new file mode 100644
index 0000000..f7c7c32
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/CodonUsage.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/CodonUsageIndices.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/CodonUsageIndices.cpython-37.pyc
new file mode 100644
index 0000000..c183b23
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/CodonUsageIndices.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/IsoelectricPoint.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/IsoelectricPoint.cpython-37.pyc
new file mode 100644
index 0000000..fe9eeea
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/IsoelectricPoint.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/MeltingTemp.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/MeltingTemp.cpython-37.pyc
new file mode 100644
index 0000000..ed9a06b
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/MeltingTemp.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/ProtParam.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/ProtParam.cpython-37.pyc
new file mode 100644
index 0000000..ae29e36
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/ProtParam.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/ProtParamData.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/ProtParamData.cpython-37.pyc
new file mode 100644
index 0000000..b2e06bc
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/ProtParamData.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..5e08256
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/__pycache__/lcc.cpython-37.pyc b/code/lib/Bio/SeqUtils/__pycache__/lcc.cpython-37.pyc
new file mode 100644
index 0000000..6e95c51
Binary files /dev/null and b/code/lib/Bio/SeqUtils/__pycache__/lcc.cpython-37.pyc differ
diff --git a/code/lib/Bio/SeqUtils/lcc.py b/code/lib/Bio/SeqUtils/lcc.py
new file mode 100644
index 0000000..dd2a976
--- /dev/null
+++ b/code/lib/Bio/SeqUtils/lcc.py
@@ -0,0 +1,162 @@
+# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Local Composition Complexity."""
+
+import math
+
+
+def lcc_mult(seq, wsize):
+    """Calculate Local Composition Complexity (LCC) values over sliding window.
+
+    Returns a list of floats, the LCC values for a sliding window over
+    the sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+    wsize - window size, integer
+
+    The result is the same as applying lcc_simp multiple times, but this
+    version is optimized for speed. The optimization works by using the
+    value of the previous window as a base to compute the next one.
+    """
+    l2 = math.log(2)
+    tamseq = len(seq)
+    upper = str(seq).upper()
+    compone = [0]
+    lccsal = [0]
+    for i in range(wsize):
+        compone.append(
+            ((i + 1) / float(wsize)) * ((math.log((i + 1) / float(wsize))) / l2)
+        )
+    window = seq[0:wsize]
+    cant_a = window.count("A")
+    cant_c = window.count("C")
+    cant_t = window.count("T")
+    cant_g = window.count("G")
+    term_a = compone[cant_a]
+    term_c = compone[cant_c]
+    term_t = compone[cant_t]
+    term_g = compone[cant_g]
+    lccsal.append(-(term_a + term_c + term_t + term_g))
+    tail = seq[0]
+    for x in range(tamseq - wsize):
+        window = upper[x + 1 : wsize + x + 1]
+        if tail == window[-1]:
+            lccsal.append(lccsal[-1])
+        elif tail == "A":
+            cant_a -= 1
+            if window.endswith("C"):
+                cant_c += 1
+                term_a = compone[cant_a]
+                term_c = compone[cant_c]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("T"):
+                cant_t += 1
+                term_a = compone[cant_a]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("G"):
+                cant_g += 1
+                term_a = compone[cant_a]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == "C":
+            cant_c -= 1
+            if window.endswith("A"):
+                cant_a += 1
+                term_a = compone[cant_a]
+                term_c = compone[cant_c]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("T"):
+                cant_t += 1
+                term_c = compone[cant_c]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("G"):
+                cant_g += 1
+                term_c = compone[cant_c]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == "T":
+            cant_t -= 1
+            if window.endswith("A"):
+                cant_a += 1
+                term_a = compone[cant_a]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("C"):
+                cant_c += 1
+                term_c = compone[cant_c]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("G"):
+                cant_g += 1
+                term_t = compone[cant_t]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == "G":
+            cant_g -= 1
+            if window.endswith("A"):
+                cant_a += 1
+                term_a = compone[cant_a]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("C"):
+                cant_c += 1
+                term_c = compone[cant_c]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith("T"):
+                cant_t += 1
+                term_t = compone[cant_t]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        tail = window[0]
+    return lccsal
+
+
+def lcc_simp(seq):
+    """Calculate Local Composition Complexity (LCC) for a sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+
+    Returns the Local Composition Complexity (LCC) value for the entire
+    sequence (as a float).
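+
+    A worked example: in "ACGT" each base has frequency 0.25, so the
+    complexity is -(4 * 0.25 * log2(0.25)) = 2.0:
+
+    >>> from Bio.SeqUtils.lcc import lcc_simp
+    >>> lcc_simp("ACGT")
+    2.0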
+
+    Reference:
+    Andrzej K Konopka (2005) Sequence Complexity and Composition
+    https://doi.org/10.1038/npg.els.0005260
+    """
+    wsize = len(seq)
+    upper = str(seq).upper()
+    l2 = math.log(2)
+    if "A" not in seq:
+        term_a = 0
+        # Check to avoid calculating the log of 0.
+    else:
+        term_a = ((upper.count("A")) / float(wsize)) * (
+            (math.log((upper.count("A")) / float(wsize))) / l2
+        )
+    if "C" not in seq:
+        term_c = 0
+    else:
+        term_c = ((upper.count("C")) / float(wsize)) * (
+            (math.log((upper.count("C")) / float(wsize))) / l2
+        )
+    if "T" not in seq:
+        term_t = 0
+    else:
+        term_t = ((upper.count("T")) / float(wsize)) * (
+            (math.log((upper.count("T")) / float(wsize))) / l2
+        )
+    if "G" not in seq:
+        term_g = 0
+    else:
+        term_g = ((upper.count("G")) / float(wsize)) * (
+            (math.log((upper.count("G")) / float(wsize))) / l2
+        )
+    return -(term_a + term_c + term_t + term_g)
diff --git a/code/lib/Bio/Sequencing/Ace.py b/code/lib/Bio/Sequencing/Ace.py
new file mode 100644
index 0000000..5b446b4
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Ace.py
@@ -0,0 +1,594 @@
+# Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Parser for ACE files output by PHRAP.
+
+Written by Frank Kauff (fkauff@duke.edu) and
+Cymon J. Cox (cymon@duke.edu)
+
+Usage:
+
+There are two ways of reading an ace file:
+
+1. The function 'read' reads the whole file at once;
+2. The function 'parse' reads the file contig after contig.
+
+First option, parse the whole ace file at once::
+
+        from Bio.Sequencing import Ace
+        acefilerecord = Ace.read(open('my_ace_file.ace'))
+
+This gives you:
+ - acefilerecord.ncontigs (the number of contigs in the ace file)
+ - acefilerecord.nreads (the number of reads in the ace file)
+ - acefilerecord.contigs[] (one instance of the Contig class for each contig)
+
+The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
+for this contig in a list of instances of the Read class, e.g.::
+
+        contig3 = acefilerecord.contigs[2]
+        read4 = contig3.reads[3]
+        RD_of_read4 = read4.rd
+        DS_of_read4 = read4.ds
+
+CT, WA, RT tags from the end of the file can appear anywhere and are
+automatically sorted into the right place.
+
+See _RecordConsumer for details.
+
+The second option is to iterate over the contigs of an ace file one by one
+in the usual way::
+
+    from Bio.Sequencing import Ace
+    contigs = Ace.parse(open('my_ace_file.ace'))
+    for contig in contigs:
+        print(contig.name)
+        ...
+
+Please note that for memory efficiency, when using the iterator approach, only one
+contig is kept in memory at once.  However, there can be a footer to the ACE file
+containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
+Because the parser doesn't see this data until the final record, it cannot be added to
+the appropriate records.  Instead these tags will be returned with the last contig record.
+Thus an ace file does not entirely suit the concept of iterating. If WA, CT, RT, WR tags
+are needed, the 'read' function rather than the 'parse' function might be more appropriate.
+"""
+
+
+class rd:
+    """RD (reads), store a read with its name, sequence etc.
+
+    The location and strand each read is mapped to is held in the AF lines.
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.name = ""
+        self.padded_bases = None
+        self.info_items = None
+        self.read_tags = None
+        self.sequence = ""
+
+
+class qa:
+    """QA (read quality), including which part if any was used as the consensus."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.qual_clipping_start = None
+        self.qual_clipping_end = None
+        self.align_clipping_start = None
+        self.align_clipping_end = None
+        if line:
+            header = line.split()
+            self.qual_clipping_start = int(header[1])
+            self.qual_clipping_end = int(header[2])
+            self.align_clipping_start = int(header[3])
+            self.align_clipping_end = int(header[4])
+
+
+class ds:
+    """DS lines, include file name of a read's chromatogram file."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.chromat_file = ""
+        self.phd_file = ""
+        self.time = ""
+        self.chem = ""
+        self.dye = ""
+        self.template = ""
+        self.direction = ""
+        if line:
+            tags = [
+                "CHROMAT_FILE",
+                "PHD_FILE",
+                "TIME",
+                "CHEM",
+                "DYE",
+                "TEMPLATE",
+                "DIRECTION",
+            ]
+            poss = [line.find(x) for x in tags]
+            tagpos = dict(zip(poss, tags))
+            if -1 in tagpos:
+                del tagpos[-1]
+            ps = sorted(tagpos)  # the keys
+            for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
+                setattr(
+                    self,
+                    tagpos[p1].lower(),
+                    line[p1 + len(tagpos[p1]) + 1 : p2].strip(),
+                )
+
+
+class af:
+    """AF lines, define the location of the read within the contig.
+
+    Note attribute coru is short for complemented (C) or uncomplemented (U),
+    since the strand information is stored in an ACE file using either the
+    C or U character.
+    """
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.coru = None
+        self.padded_start = None
+        if line:
+            header = line.split()
+            self.name = header[1]
+            self.coru = header[2]
+            self.padded_start = int(header[3])
+
+
+class bs:
+    """BS (base segment), which read was chosen as the consensus at each position."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.padded_start = None
+        self.padded_end = None
+        if line:
+            header = line.split()
+            self.padded_start = int(header[1])
+            self.padded_end = int(header[2])
+            self.name = header[3]
+
+
+class rt:
+    """RT (transient read tags), generated by crossmatch and phrap."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.tag_type = ""
+        self.program = ""
+        self.padded_start = None
+        self.padded_end = None
+        self.date = ""
+        self.comment = []
+        if line:
+            header = line.split()
+            self.name = header[0]
+            self.tag_type = header[1]
+            self.program = header[2]
+            self.padded_start = int(header[3])
+            self.padded_end = int(header[4])
+            self.date = header[5]
+
+
+class ct:
+    """CT (consensus tags)."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.tag_type = ""
+        self.program = ""
+        self.padded_start = None
+        self.padded_end = None
+        self.date = ""
+        self.notrans = ""
+        self.info = []
+        self.comment = []
+        if line:
+            header = line.split()
+            self.name = header[0]
+            self.tag_type = header[1]
+            self.program = header[2]
+            self.padded_start = int(header[3])
+            self.padded_end = int(header[4])
+            self.date = header[5]
+            if len(header) == 7:
+                self.notrans = header[6]
+
+
+class wa:
+    """WA (whole assembly tag), holds the assembly program name, version, etc."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.tag_type = ""
+        self.program = ""
+        self.date = ""
+        self.info = []
+        if line:
+            header = line.split()
+            self.tag_type = header[0]
+            self.program = header[1]
+            self.date = header[2]
+
+
+class wr:
+    """WR lines."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.aligned = ""
+        self.program = ""
+        self.date = ""
+        if line:
+            header = line.split()
+            self.name = header[0]
+            self.aligned = header[1]
+            self.program = header[2]
+            self.date = header[3]
+
+
+class Reads:
+    """Holds information about a read supporting an ACE contig."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.rd = None  # one per read
+        self.qa = None  # one per read
+        self.ds = None  # none or one per read
+        self.rt = None  # none or many per read
+        self.wr = None  # none or many per read
+        if line:
+            self.rd = rd()
+            header = line.split()
+            self.rd.name = header[1]
+            self.rd.padded_bases = int(header[2])
+            self.rd.info_items = int(header[3])
+            self.rd.read_tags = int(header[4])
+
+
+class Contig:
+    """Holds information about a contig from an ACE record."""
+
+    def __init__(self, line=None):
+        """Initialize the class."""
+        self.name = ""
+        self.nbases = None
+        self.nreads = None
+        self.nsegments = None
+        self.uorc = None
+        self.sequence = ""
+        self.quality = []
+        self.af = []
+        self.bs = []
+        self.reads = []
+        self.ct = None  # none or many
+        self.wa = None  # none or many
+        if line:
+            header = line.split()
+            self.name = header[1]
+            self.nbases = int(header[2])
+            self.nreads = int(header[3])
+            self.nsegments = int(header[4])
+            self.uorc = header[5]
+
+
+def parse(source):
+    """Iterate of ACE file contig by contig.
+
+    Argument source is a file-like object or a path to a file.
+
+    This function returns an iterator that allows you to iterate
+    over the ACE file record by record::
+
+        records = parse(source)
+        for record in records:
+            # do something with the record
+
+    where each record is a Contig object.
+    """
+    try:
+        handle = open(source)
+    except TypeError:
+        handle = source
+        if handle.read(0) != "":
+            raise ValueError("Ace files must be opened in text mode.") from None
+
+    try:
+        line = ""
+        while True:
+            # at beginning, skip the AS and look for first CO command
+            try:
+                while True:
+                    if line.startswith("CO"):
+                        break
+                    line = next(handle)
+            except StopIteration:
+                return
+
+            record = Contig(line)
+
+            for line in handle:
+                line = line.strip()
+                if not line:
+                    break
+                record.sequence += line
+
+            for line in handle:
+                if line.strip():
+                    break
+            if not line.startswith("BQ"):
+                raise ValueError("Failed to find BQ line")
+
+            for line in handle:
+                if not line.strip():
+                    break
+                record.quality.extend(int(x) for x in line.split())
+
+            for line in handle:
+                if line.strip():
+                    break
+
+            while True:
+                if not line.startswith("AF "):
+                    break
+                record.af.append(af(line))
+                try:
+                    line = next(handle)
+                except StopIteration:
+                    raise ValueError("Unexpected end of AF block") from None
+
+            while True:
+                if line.strip():
+                    break
+                try:
+                    line = next(handle)
+                except StopIteration:
+                    raise ValueError("Unexpected end of file") from None
+
+            while True:
+                if not line.startswith("BS "):
+                    break
+                record.bs.append(bs(line))
+                try:
+                    line = next(handle)
+                except StopIteration:
+                    raise ValueError("Failed to find end of BS block") from None
+
+            # now read all the read data
+            # it starts with an 'RD' line, followed by a mandatory QA,
+            # and then an optional DS
+            # CT,RT,WA,WR may or may not be there in unlimited quantity.
+            # They might refer to the actual read or contig, or, if
+            # encountered at the end of file, to any previous read or contig.
+            # The sort() method deals with that later.
+            while True:
+
+                # each read must have a rd and qa
+                try:
+                    while True:
+                        # Stop once we reach the next RD line.
+                        if line.startswith("RD "):
+                            break
+                        line = next(handle)
+                except StopIteration:
+                    raise ValueError("Failed to find RD line") from None
+
+                record.reads.append(Reads(line))
+
+                for line in handle:
+                    line = line.strip()
+                    if not line:
+                        break
+                    record.reads[-1].rd.sequence += line
+
+                for line in handle:
+                    if line.strip():
+                        break
+                if not line.startswith("QA "):
+                    raise ValueError("Failed to find QA line")
+                record.reads[-1].qa = qa(line)
+
+                # now one ds can follow
+                for line in handle:
+                    if line.strip():
+                        break
+                else:
+                    break
+
+                if line.startswith("DS "):
+                    record.reads[-1].ds = ds(line)
+                    line = ""
+                # the file could just end, or there's some more stuff.
+                # In ace files, anything can happen.
+                # the following tags are interspersed between reads and can appear multiple times.
+                while True:
+                    # something left
+                    try:
+                        while True:
+                            if line.strip():
+                                break
+                            line = next(handle)
+                    except StopIteration:
+                        # file ends here
+                        break
+                    if line.startswith("RT{"):
+                        # now if we're at the end of the file, this rt could
+                        # belong to a previous read, not the actual one.
+                        # we store it here where it appears; the user can sort later.
+                        if record.reads[-1].rt is None:
+                            record.reads[-1].rt = []
+                        for line in handle:
+                            line = line.strip()
+                            # if line=="COMMENT{":
+                            if line.startswith("COMMENT{"):
+                                if line[8:].strip():
+                                    # MIRA 3.0.5 would miss the new line out :(
+                                    record.reads[-1].rt[-1].comment.append(line[8:])
+                                for line in handle:
+                                    line = line.strip()
+                                    if line.endswith("C}"):
+                                        break
+                                    record.reads[-1].rt[-1].comment.append(line)
+                            elif line == "}":
+                                break
+                            else:
+                                record.reads[-1].rt.append(rt(line))
+                        line = ""
+                    elif line.startswith("WR{"):
+                        if record.reads[-1].wr is None:
+                            record.reads[-1].wr = []
+                        for line in handle:
+                            line = line.strip()
+                            if line == "}":
+                                break
+                            record.reads[-1].wr.append(wr(line))
+                        line = ""
+                    elif line.startswith("WA{"):
+                        if record.wa is None:
+                            record.wa = []
+                        try:
+                            line = next(handle)
+                        except StopIteration:
+                            raise ValueError("Failed to read WA block") from None
+                        record.wa.append(wa(line))
+                        for line in handle:
+                            line = line.strip()
+                            if line == "}":
+                                break
+                            record.wa[-1].info.append(line)
+                        line = ""
+                    elif line.startswith("CT{"):
+                        if record.ct is None:
+                            record.ct = []
+                        try:
+                            line = next(handle)
+                        except StopIteration:
+                            raise ValueError("Failed to read CT block") from None
+                        record.ct.append(ct(line))
+                        for line in handle:
+                            line = line.strip()
+                            if line == "COMMENT{":
+                                for line in handle:
+                                    line = line.strip()
+                                    if line.endswith("C}"):
+                                        break
+                                    record.ct[-1].comment.append(line)
+                            elif line == "}":
+                                break
+                            else:
+                                record.ct[-1].info.append(line)
+                        line = ""
+                    else:
+                        break
+
+                if not line.startswith("RD"):  # another read?
+                    break
+
+            yield record
+
+    finally:
+        if handle is not source:
+            handle.close()
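+
+# Usage sketch for parse() ("assembly.ace" is a placeholder file name):
+#
+#     with open("assembly.ace") as handle:
+#         for contig in parse(handle):
+#             print(contig.name, contig.nbases, contig.nreads)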
+
+
+class ACEFileRecord:
+    """Holds data of an ACE file."""
+
+    def __init__(self):
+        """Initialize the class."""
+        self.ncontigs = None
+        self.nreads = None
+        self.contigs = []
+        self.wa = None  # none or many
+
+    def sort(self):
+        """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible."""
+        ct = []
+        rt = []
+        wr = []
+        # search for tags that aren't in the right position
+        for i, c in enumerate(self.contigs):
+            if c.wa:
+                if not self.wa:
+                    self.wa = []
+                self.wa.extend(c.wa)
+            if c.ct:
+                newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name]
+                for x in newcts:
+                    self.contigs[i].ct.remove(x)
+                ct.extend(newcts)
+            for j, r in enumerate(c.reads):
+                if r.rt:
+                    newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name]
+                    for x in newrts:
+                        self.contigs[i].reads[j].rt.remove(x)
+                    rt.extend(newrts)
+                if r.wr:
+                    newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name]
+                    for x in newwrs:
+                        self.contigs[i].reads[j].wr.remove(x)
+                    wr.extend(newwrs)
+        # now sort them into their proper place
+        for i, c in enumerate(self.contigs):
+            for ct_tag in ct:
+                if ct_tag.name == c.name:
+                    if self.contigs[i].ct is None:
+                        self.contigs[i].ct = []
+                    self.contigs[i].ct.append(ct_tag)
+            if rt or wr:
+                for j, r in enumerate(c.reads):
+                    for rt_tag in rt:
+                        if rt_tag.name == r.rd.name:
+                            if self.contigs[i].reads[j].rt is None:
+                                self.contigs[i].reads[j].rt = []
+                            self.contigs[i].reads[j].rt.append(rt_tag)
+                    for wr_tag in wr:
+                        if wr_tag.name == r.rd.name:
+                            if self.contigs[i].reads[j].wr is None:
+                                self.contigs[i].reads[j].wr = []
+                            self.contigs[i].reads[j].wr.append(wr_tag)
+
+
+def read(handle):
+    """Parse a full ACE file into a list of contigs."""
+    handle = iter(handle)
+
+    record = ACEFileRecord()
+
+    try:
+        line = next(handle)
+    except StopIteration:
+        raise ValueError("Premature end of file") from None
+
+    # check if the file starts correctly
+    if not line.startswith("AS"):
+        raise ValueError("File does not start with 'AS'.")
+
+    words = line.split()
+    record.ncontigs = int(words[1])
+    record.nreads = int(words[2])
+
+    # now read all the records
+    record.contigs = list(parse(handle))
+    # wa, ct, rt tags are usually at the end of the file, but not necessarily.
+    # If the iterator is used, the tags are returned with the contig or the read
+    # after which they appear; if all tags are at the end, they are read with the
+    # last contig. The concept of an iterator leaves no other choice. But if the
+    # user calls read(), we can check them and put them into the appropriate
+    # contig/read instance.
+    # Conclusion: an ACE file is not a file type for which iteration is 100% suitable...
+    record.sort()
+    return record
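+
+
+# Usage sketch for read() ("assembly.ace" is a placeholder; unlike parse(),
+# read() loads the whole file and calls sort(), so stray wr/rt/ct tags are
+# attached to the contig or read they name):
+#
+#     with open("assembly.ace") as handle:
+#         record = read(handle)
+#     print(record.ncontigs, record.nreads, len(record.contigs))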
diff --git a/code/lib/Bio/Sequencing/Applications/_Novoalign.py b/code/lib/Bio/Sequencing/Applications/_Novoalign.py
new file mode 100644
index 0000000..97a0a44
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Applications/_Novoalign.py
@@ -0,0 +1,217 @@
+# Copyright 2009 by Osvaldo Zagordi.  All rights reserved.
+# Revisions copyright 2010 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the short read aligner Novoalign by Novocraft."""
+
+
+from Bio.Application import _Option, AbstractCommandline
+
+
+class NovoalignCommandline(AbstractCommandline):
+    """Command line wrapper for novoalign by Novocraft.
+
+    See www.novocraft.com - novoalign is a short read alignment program.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import NovoalignCommandline
+    >>> novoalign_cline = NovoalignCommandline(database='some_db',
+    ...                                        readfile='some_seq.txt')
+    >>> print(novoalign_cline)
+    novoalign -d some_db -f some_seq.txt
+
+    As with all the Biopython application wrappers, you can also add or
+    change options after creating the object:
+
+    >>> novoalign_cline.format = 'PRBnSEQ'
+    >>> novoalign_cline.r_method='0.99' # limited valid values
+    >>> novoalign_cline.fragment = '250 20' # must be given as a string
+    >>> novoalign_cline.miRNA = 100
+    >>> print(novoalign_cline)
+    novoalign -d some_db -f some_seq.txt -F PRBnSEQ -r 0.99 -i 250 20 -m 100
+
+    You would typically run the command line with novoalign_cline() or via
+    the Python subprocess module, as described in the Biopython tutorial.
+
+    Last checked against version: 2.05.04
+
+    """
+
+    def __init__(self, cmd="novoalign", **kwargs):
+        """Initialize the class."""
+        READ_FORMAT = ["FA", "SLXFQ", "STDFQ", "ILMFQ", "PRB", "PRBnSEQ"]
+        REPORT_FORMAT = ["Native", "Pairwise", "SAM"]
+        REPEAT_METHOD = ["None", "Random", "All", "Exhaustive", "0.99"]
+
+        self.parameters = [
+            _Option(
+                ["-d", "database"], "database filename", filename=True, equate=False
+            ),
+            _Option(["-f", "readfile"], "read file", filename=True, equate=False),
+            _Option(
+                ["-F", "format"],
+                "Format of read files.\n\nAllowed values: %s" % ", ".join(READ_FORMAT),
+                checker_function=lambda x: x in READ_FORMAT,
+                equate=False,
+            ),
+            # Alignment scoring options
+            _Option(
+                ["-t", "threshold"],
+                "Threshold for alignment score",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-g", "gap_open"],
+                "Gap opening penalty [default: 40]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-x", "gap_extend"],
+                "Gap extend penalty [default: 15]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-u", "unconverted"],
+                "Experimental: unconverted cytosines penalty in bisulfite mode\n\n"
+                "Default: no penalty",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Quality control and read filtering
+            _Option(
+                ["-l", "good_bases"],
+                "Minimum number of good quality bases [default: log(N_g, 4) + 5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-h", "homopolymer"],
+                "Homopolymer read filter [default: 20; disable: negative value]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Read preprocessing options
+            _Option(
+                ["-a", "adapter3"],
+                "Strips a 3' adapter sequence prior to alignment.\n\n"
+                "With paired ends two adapters can be specified",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+            _Option(
+                ["-n", "truncate"],
+                "Truncate to specific length before alignment",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-s", "trimming"],
+                "If fail to align, trim by s bases until they map or become shorter than l.\n\n"
+                "Ddefault: 2",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-5", "adapter5"],
+                "Strips a 5' adapter sequence.\n\n"
+                "Similar to -a (adaptor3), but on the 5' end.",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+            # Reporting options
+            _Option(
+                ["-o", "report"],
+                "Specifies the report format.\n\nAllowed values: %s\nDefault: Native"
+                % ", ".join(REPORT_FORMAT),
+                checker_function=lambda x: x in REPORT_FORMAT,
+                equate=False,
+            ),
+            _Option(
+                ["-Q", "quality"],
+                "Lower threshold for an alignment to be reported [default: 0]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-R", "repeats"],
+                "If score difference is higher, report repeats.\n\n"
+                "Otherwise -r read method applies [default: 5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-r", "r_method"],
+                "Methods to report reads with multiple matches.\n\n"
+                "Allowed values: %s\n"
+                "'All' and 'Exhaustive' accept limits." % ", ".join(REPEAT_METHOD),
+                checker_function=lambda x: x.split()[0] in REPEAT_METHOD,
+                equate=False,
+            ),
+            _Option(
+                ["-e", "recorded"],
+                "Alignments recorded with score equal to the best.\n\n"
+                "Default: 1000 in default read method, otherwise no limit.",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-q", "qual_digits"],
+                "Decimal digits for quality scores [default: 0]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Paired end options
+            _Option(
+                ["-i", "fragment"],
+                "Fragment length (2 reads + insert) and standard deviation [default: 250 30]",
+                checker_function=lambda x: len(x.split()) == 2,
+                equate=False,
+            ),
+            _Option(
+                ["-v", "variation"],
+                "Structural variation penalty [default: 70]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # miRNA mode
+            _Option(
+                ["-m", "miRNA"],
+                "Sets miRNA mode and optionally sets a value for the region scanned [default: off]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Multithreading
+            _Option(
+                ["-c", "cores"],
+                "Number of threads, disabled on free versions [default: number of cores]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Quality calibrations
+            _Option(
+                ["-k", "read_cal"],
+                "Read quality calibration from file (mismatch counts)",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+            _Option(
+                ["-K", "write_cal"],
+                "Accumulate mismatch counts and write to file",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
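+# Sketch of actually running the wrapper (assumes the novoalign binary is on
+# PATH and that "some_db" / "some_seq.txt" exist; both names are placeholders):
+#
+#     import subprocess
+#     cline = NovoalignCommandline(database="some_db", readfile="some_seq.txt")
+#     result = subprocess.run(str(cline), shell=True,
+#                             capture_output=True, text=True)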
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/Sequencing/Applications/__init__.py b/code/lib/Bio/Sequencing/Applications/__init__.py
new file mode 100644
index 0000000..e53d906
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Applications/__init__.py
@@ -0,0 +1,56 @@
+# Copyright 2009 by Osvaldo Zagordi.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Sequencing related command line application wrappers (OBSOLETE).
+
+We have decided to remove this module in future, and instead recommend
+building your command and invoking it via the subprocess module directly.
+"""
+
+from ._Novoalign import NovoalignCommandline
+from ._bwa import BwaIndexCommandline, BwaAlignCommandline, BwaSamseCommandline
+from ._bwa import BwaSampeCommandline, BwaBwaswCommandline, BwaMemCommandline
+from ._samtools import SamtoolsViewCommandline, SamtoolsCalmdCommandline
+from ._samtools import SamtoolsCatCommandline, SamtoolsFaidxCommandline
+from ._samtools import SamtoolsFixmateCommandline, SamtoolsIdxstatsCommandline
+from ._samtools import SamtoolsIndexCommandline, SamtoolsMergeCommandline
+from ._samtools import SamtoolsMpileupCommandline, SamtoolsPhaseCommandline
+from ._samtools import SamtoolsReheaderCommandline, SamtoolsRmdupCommandline
+from ._samtools import (
+    SamtoolsVersion0xSortCommandline,
+    SamtoolsVersion1xSortCommandline,
+    SamtoolsTargetcutCommandline,
+)
+from ._samtools import SamtoolsVersion0xSortCommandline as SamtoolsSortCommandline
+
+
+# Make this explicit, then they show up in the API docs
+__all__ = (
+    "BwaIndexCommandline",
+    "BwaAlignCommandline",
+    "BwaSamseCommandline",
+    "BwaSampeCommandline",
+    "BwaBwaswCommandline",
+    "BwaMemCommandline",
+    "NovoalignCommandline",
+    "SamtoolsViewCommandline",
+    "SamtoolsCalmdCommandline",
+    "SamtoolsCatCommandline",
+    "SamtoolsFaidxCommandline",
+    "SamtoolsFixmateCommandline",
+    "SamtoolsIdxstatsCommandline",
+    "SamtoolsIndexCommandline",
+    "SamtoolsMergeCommandline",
+    "SamtoolsMpileupCommandline",
+    "SamtoolsPhaseCommandline",
+    "SamtoolsReheaderCommandline",
+    "SamtoolsRmdupCommandline",
+    "SamtoolsSortCommandline",
+    "SamtoolsVersion0xSortCommandline",
+    "SamtoolsVersion1xSortCommandline",
+    "SamtoolsTargetcutCommandline",
+)
diff --git a/code/lib/Bio/Sequencing/Applications/__pycache__/_Novoalign.cpython-37.pyc b/code/lib/Bio/Sequencing/Applications/__pycache__/_Novoalign.cpython-37.pyc
new file mode 100644
index 0000000..23aa246
Binary files /dev/null and b/code/lib/Bio/Sequencing/Applications/__pycache__/_Novoalign.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/Applications/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Sequencing/Applications/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..8a77d3b
Binary files /dev/null and b/code/lib/Bio/Sequencing/Applications/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/Applications/__pycache__/_bwa.cpython-37.pyc b/code/lib/Bio/Sequencing/Applications/__pycache__/_bwa.cpython-37.pyc
new file mode 100644
index 0000000..744a837
Binary files /dev/null and b/code/lib/Bio/Sequencing/Applications/__pycache__/_bwa.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/Applications/__pycache__/_samtools.cpython-37.pyc b/code/lib/Bio/Sequencing/Applications/__pycache__/_samtools.cpython-37.pyc
new file mode 100644
index 0000000..4563761
Binary files /dev/null and b/code/lib/Bio/Sequencing/Applications/__pycache__/_samtools.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/Applications/_bwa.py b/code/lib/Bio/Sequencing/Applications/_bwa.py
new file mode 100644
index 0000000..f07ec6a
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Applications/_bwa.py
@@ -0,0 +1,640 @@
+# Copyright 2013 Saket Choudhary.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for bwa."""
+
+from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline
+from Bio.Application import _StaticArgument
+
+
+class BwaIndexCommandline(AbstractCommandline):
+    """Command line wrapper for Burrows Wheeler Aligner (BWA) index.
+
+    Index database sequences in the FASTA format, equivalent to::
+
+        $ bwa index [-p prefix] [-a algoType] [-c] <in.fasta>
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaIndexCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw")
+    >>> print(index_cmd)
+    bwa index -a bwtsw /path/to/reference_genome.fasta
+
+    You would typically run the command using index_cmd() or via the
+    Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("index"),
+            _Option(
+                ["-a", "a", "algorithm"],
+                """Algorithm for constructing BWT index.
+
+                    Available options are:
+                        - is:    IS linear-time algorithm for constructing suffix array.
+                          It requires 5.37N memory where N is the size of the database.
+                          IS is moderately fast, but does not work with database larger
+                          than 2GB. IS is the default algorithm due to its simplicity.
+                        - bwtsw: Algorithm implemented in BWT-SW. This method works with the
+                          whole human genome, but it does not work with database
+                          smaller than 10MB and it is usually slower than IS.""",
+                checker_function=lambda x: x in ["is", "bwtsw"],
+                equate=False,
+                is_required=True,
+            ),
+            _Option(
+                ["-p", "p", "prefix"],
+                "Prefix of the output database [same as db filename]",
+                equate=False,
+                is_required=False,
+            ),
+            _Argument(["infile"], "Input file name", filename=True, is_required=True),
+            _Switch(
+                ["-c", "c"],
+                "Build color-space index. The input fasta should be in nucleotide space.",
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
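+# Sketch of executing the index command (assumes bwa is on PATH; "ref.fasta"
+# is a placeholder; calling the wrapper returns its stdout and stderr):
+#
+#     index_cmd = BwaIndexCommandline(infile="ref.fasta", algorithm="is")
+#     stdout, stderr = index_cmd()  # runs: bwa index -a is ref.fasta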
+
+class BwaAlignCommandline(AbstractCommandline):
+    """Command line wrapper for Burrows Wheeler Aligner (BWA) aln.
+
+    Run a BWA alignment, equivalent to::
+
+        $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai>
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaAlignCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> read_file = "/path/to/read_1.fq"
+    >>> output_sai_file = "/path/to/read_1.sai"
+    >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file)
+    >>> print(align_cmd)
+    bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq
+
+    You would typically run the command line using align_cmd(stdout=output_sai_file)
+    or via the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("aln"),
+            _Argument(
+                ["reference"], "Reference file name", filename=True, is_required=True
+            ),
+            _Argument(["read_file"], "Read file name", filename=True, is_required=True),
+            _Option(
+                ["-n", "n"],
+                "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]",
+                checker_function=lambda x: isinstance(x, (int, float)),
+                equate=False,
+            ),
+            _Option(
+                ["-o", "o"],
+                "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]",
+                checker_function=lambda x: isinstance(x, (int, float)),
+                equate=False,
+            ),
+            _Option(
+                ["-e", "e"],
+                "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-d", "d"],
+                "Disallow a long deletion within INT bp towards the 3-end [16]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-i", "i"],
+                "Disallow an indel within INT bp towards the ends [5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-l", "l"],
+                """Take the first INT subsequence as seed.
+
+                    If INT is larger than the query sequence, seeding will be disabled.
+                    For long reads, this option is typically ranged from 25 to 35 for
+                    -k 2. [inf]""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-k", "k"],
+                "Maximum edit distance in the seed [2]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-t", "t"],
+                "Number of threads (multi-threading mode) [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-M", "M"],
+                "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-O", "O"],
+                "Gap open penalty [11]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-E", "E"],
+                "Gap extension penalty [4]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-R", "R"],
+                """Proceed with suboptimal alignments if there are no more than INT equally best hits.
+
+                    This option only affects paired-end mapping. Increasing this threshold helps
+                    to improve the pairing accuracy at the cost of speed, especially for short
+                    reads (~32bp).""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-q", "q"],
+                r"""Parameter for read trimming [0].
+
+                    BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT,
+                    where l is the original read length.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-B", "B"],
+                "Length of barcode starting from the 5'-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Switch(
+                ["-c", "c"],
+                "Reverse query but not complement it, which is required for alignment in the color space.",
+            ),
+            _Switch(
+                ["-N", "N"],
+                "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default.",
+            ),
+            _Switch(
+                ["-I", "I"],
+                "The input is in the Illumina 1.3+ read format (quality equals ASCII-64).",
+            ),
+            _Switch(
+                ["-b", "b"],
+                "Specify the input read sequence file is the BAM format",
+            ),
+            _Switch(
+                ["-0", "b0"],
+                "When -b is specified, only use single-end reads in mapping.",
+            ),
+            _Switch(
+                ["-1", "b1"],
+                "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads).",
+            ),
+            _Switch(
+                ["-2", "b2"],
+                "When -b is specified, only use the second read in a read pair in mapping.",
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class BwaSamseCommandline(AbstractCommandline):
+    """Command line wrapper for Burrows Wheeler Aligner (BWA) samse.
+
+    Generate alignments in the SAM format given single-end reads.
+    Equivalent to::
+
+        $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaSamseCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> read_file = "/path/to/read_1.fq"
+    >>> sai_file = "/path/to/read_1.sai"
+    >>> output_sam_file = "/path/to/read_1.sam"
+    >>> samse_cmd = BwaSamseCommandline(reference=reference_genome,
+    ...                                 read_file=read_file, sai_file=sai_file)
+    >>> print(samse_cmd)
+    bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq
+
+    You would typically run the command line using samse_cmd(stdout=output_sam_file)
+    or via the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("samse"),
+            _Argument(
+                ["reference"], "Reference file name", filename=True, is_required=True
+            ),
+            _Argument(["sai_file"], "Sai file name", filename=True, is_required=True),
+            _Argument(
+                ["read_file"], "Read  file name", filename=True, is_required=True
+            ),
+            _Option(
+                ["-n", "n"],
+                """Maximum number of alignments to output in the XA tag for reads paired properly.
+
+                    If a read has more than INT hits, the XA tag will not be written. [3]""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-r", "r"],
+                "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class BwaSampeCommandline(AbstractCommandline):
+    r"""Command line wrapper for Burrows Wheeler Aligner (BWA) sampe.
+
+    Generate alignments in the SAM format given paired-end reads.
+    Equivalent to::
+
+        $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaSampeCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> read_file1 = "/path/to/read_1.fq"
+    >>> read_file2 = "/path/to/read_2.fq"
+    >>> sai_file1 = "/path/to/read_1.sai"
+    >>> sai_file2 = "/path/to/read_2.sai"
+    >>> output_sam_file = "/path/to/output.sam"
+    >>> read_group = r"@RG\tID:foo\tSM:bar"  # BWA will turn backslash-t into tab
+    >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome,
+    ...                                 sai_file1=sai_file1, sai_file2=sai_file2,
+    ...                                 read_file1=read_file1, read_file2=read_file2,
+    ...                                 r=read_group)
+    >>> print(sampe_cmd)
+    bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG\tID:foo\tSM:bar
+
+    You would typically run the command line using sampe_cmd(stdout=output_sam_file)
+    or via the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    # TODO - Should the read group have a raw tab in it, or \t?
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("sampe"),
+            _Argument(
+                ["reference"], "Reference file name", filename=True, is_required=True
+            ),
+            _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True),
+            _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True),
+            _Argument(["read_file1"], "Read  file 1", filename=True, is_required=True),
+            _Argument(["read_file2"], "Read  file 2", filename=True, is_required=True),
+            _Option(
+                ["-a", "a"],
+                """Maximum insert size for a read pair to be considered being mapped properly [500].
+
+                    Since 0.4.5, this option is only used when there are not enough
+                    good alignments to infer the distribution of insert sizes.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-o", "o"],
+                """Maximum occurrences of a read for pairing [100000].
+
+                        A read with more occurrences will be treated as a single-end read.
+                        Reducing this parameter helps faster pairing.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-n", "n"],
+                """Maximum number of alignments to output in the XA tag for reads paired properly [3].
+
+                    If a read has more than INT hits, the XA tag will not be written.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-N", "N"],
+                """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10].
+
+                    If a read has more than INT hits, the XA tag will not be written.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-r", "r"],
+                "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
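+# Sketch of the usual aln -> sampe flow for paired-end data (all paths are
+# placeholders; each wrapper call redirects its stdout to a file):
+#
+#     aln1 = BwaAlignCommandline(reference="ref.fasta", read_file="r1.fq")
+#     aln1(stdout="r1.sai")
+#     aln2 = BwaAlignCommandline(reference="ref.fasta", read_file="r2.fq")
+#     aln2(stdout="r2.sai")
+#     sampe_cmd = BwaSampeCommandline(reference="ref.fasta",
+#                                     sai_file1="r1.sai", sai_file2="r2.sai",
+#                                     read_file1="r1.fq", read_file2="r2.fq")
+#     sampe_cmd(stdout="aln.sam")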
+
+class BwaBwaswCommandline(AbstractCommandline):
+    """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw.
+
+    Align query sequences from FASTQ files. Equivalent to::
+
+        $ bwa bwasw [...] <in.db.fasta> <in.fq> [mate.fq]
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaBwaswCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> read_file = "/path/to/read_1.fq"
+    >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file)
+    >>> print(bwasw_cmd)
+    bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq
+
+    You would typically run the command line using bwasw_cmd() or via the
+    Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("bwasw"),
+            _Argument(
+                ["reference"], "Reference file name", filename=True, is_required=True
+            ),
+            _Argument(["read_file"], "Read file", filename=True, is_required=True),
+            _Argument(["mate_file"], "Mate file", filename=True, is_required=False),
+            _Option(
+                ["-a", "a"],
+                "Score of a match [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-b", "b"],
+                "Mismatch penalty [3]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-q", "q"],
+                "Gap open penalty [5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-r", "r"],
+                "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-t", "t"],
+                "Number of threads in the multi-threading mode [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-w", "w"],
+                "Band width in the banded alignment [33]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-T", "T"],
+                "Minimum score threshold divided by a [37]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-c", "c"],
+                """Coefficient for threshold adjustment according to query length [5.5].
+
+                    Given an l-long query, the threshold for a hit to be retained is
+                    a*max{T,c*log(l)}.""",
+                checker_function=lambda x: isinstance(x, float),
+                equate=False,
+            ),
+            _Option(
+                ["-z", "z"],
+                "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-s", "s"],
+                """Maximum SA interval size for initiating a seed [3].
+
+                    Higher -s increases accuracy at the cost of speed.""",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-N", "N"],
+                "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class BwaMemCommandline(AbstractCommandline):
+    """Command line wrapper for Burrows Wheeler Aligner (BWA) mem.
+
+    Run a BWA-MEM alignment, with single- or paired-end reads, equivalent to::
+
+        $ bwa mem [...] <in.db.fasta> <in1.fq> [<in2.fq>] > <out.sam>
+
+    See http://bio-bwa.sourceforge.net/bwa.shtml for details.
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import BwaMemCommandline
+    >>> reference_genome = "/path/to/reference_genome.fasta"
+    >>> read_file = "/path/to/read_1.fq"
+    >>> output_sam_file = "/path/to/output.sam"
+    >>> align_cmd = BwaMemCommandline(reference=reference_genome, read_file1=read_file)
+    >>> print(align_cmd)
+    bwa mem /path/to/reference_genome.fasta /path/to/read_1.fq
+
+    You would typically run the command line using align_cmd(stdout=output_sam_file)
+    or via the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="bwa", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("mem"),
+            _Argument(
+                ["reference"], "Reference file name", filename=True, is_required=True
+            ),
+            _Argument(
+                ["read_file1"], "Read 1 file name", filename=True, is_required=True
+            ),
+            _Argument(
+                ["read_file2"], "Read 2 file name", filename=True, is_required=False
+            ),
+            _Option(
+                ["-t", "t"],
+                "Number of threads [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-k", "k"],
+                "Minimum seed length. Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. [19]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-w", "w"],
+                "Band width. Essentially, gaps longer than INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-d", "d"],
+                r"Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above \|i-j\|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-r", "r"],
+                "Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]",
+                checker_function=lambda x: isinstance(x, (int, float)),
+                equate=False,
+            ),
+            _Option(
+                ["-c", "c"],
+                "Discard a MEM if it has more than INT occurrence in the genome. This is an insensitive parameter. [10000]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-A", "A"],
+                "Matching score. [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-B", "B"],
+                "Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-O", "O"],
+                "Gap open penalty. [6]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-E", "E"],
+                "Gap extension penalty. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [1]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-L", "L"],
+                "Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [5]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-U", "U"],
+                "Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [9] ",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-R", "R"],
+                "Complete read group header line. 't' can be used in STR and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\tID:foo\tSM:bar'. [null]",
+                checker_function=lambda x: isinstance(x, str),
+                equate=False,
+            ),
+            _Option(
+                ["-T", "T"],
+                "Don't output alignment with score lower than INT. This option only affects output. [30]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-v", "v"],
+                "Control the verbose level of the output. This option has not been fully supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. [3]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Switch(
+                ["-P", "P"],
+                "In the paired-end mode, perform SW to rescue missing hits only but do not try to find hits that fit a proper pair.",
+            ),
+            _Switch(
+                ["-p", "p"],
+                "Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.",
+            ),
+            _Switch(
+                ["-a", "a"],
+                "Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments.",
+            ),
+            _Switch(
+                ["-C", "C"],
+                "Append FASTA/Q comment to SAM output. This option can be used to transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.",
+            ),
+            _Switch(
+                ["-H", "H"],
+                "Use hard clipping 'H' in the SAM output. This option may dramatically reduce the redundancy of output when mapping long contig or BAC sequences.",
+            ),
+            _Switch(
+                ["-M", "M"],
+                "Mark shorter split hits as secondary (for Picard compatibility).",
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
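+# Sketch of a paired-end BWA-MEM run (assumes bwa is on PATH; the paths are
+# placeholders; the SAM output is redirected to a file via stdout):
+#
+#     mem_cmd = BwaMemCommandline(reference="ref.fasta",
+#                                 read_file1="r1.fq", read_file2="r2.fq")
+#     stdout, stderr = mem_cmd(stdout="aln.sam")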
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/Sequencing/Applications/_samtools.py b/code/lib/Bio/Sequencing/Applications/_samtools.py
new file mode 100644
index 0000000..546a358
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Applications/_samtools.py
@@ -0,0 +1,1035 @@
+# Copyright 2014 Saket Choudhary. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for samtools."""
+# Last Checked with samtools [0.1.20 and 1.2]
+# TODO samtools 1.x has additional options over 0.x which
+# are missing from this wrapper
+
+
+from Bio.Application import _Option, _Argument, _Switch
+from Bio.Application import AbstractCommandline, _ArgumentList
+from Bio.Application import _StaticArgument
+
+
+class SamtoolsViewCommandline(AbstractCommandline):
+    """Command line wrapper for samtools view.
+
+    Extract/print all or sub alignments in SAM or BAM format, equivalent to::
+
+        $ samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag]
+                        [-F skipFlag] [-q minMapQ] [-l library] [-r readGroup]
+                        [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsViewCommandline
+    >>> input_file = "/path/to/sam_or_bam_file"
+    >>> samtools_view_cmd = SamtoolsViewCommandline(input_file=input_file)
+    >>> print(samtools_view_cmd)
+    samtools view /path/to/sam_or_bam_file
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("view"),
+            _Switch(["-b", "b"], "Output in the BAM format"),
+            _Switch(
+                ["-c", "c"],
+                """Instead of printing the alignments, only count them and
+                    print the total number.
+
+                    All filter options, such as '-f', '-F' and '-q',
+                    are taken into account""",
+            ),
+            _Switch(["-h", "h"], "Include the header in the output"),
+            _Switch(
+                ["-u", "u"],
+                """Output uncompressed BAM.
+
+                    This option saves time spent on compression/decompression
+                    and is thus preferred when the output is piped to
+                    another samtools command""",
+            ),
+            _Switch(["-H", "H"], "Output the header only"),
+            _Switch(
+                ["-S", "S"],
+                """Input is in SAM.
+                    If @SQ header lines are absent,
+                    the '-t' option is required.""",
+            ),
+            _Option(
+                ["-t", "t"],
+                """This file is TAB-delimited.
+                    Each line must contain the reference name and the
+                    length of the reference, one line for each
+                    distinct reference; additional fields are ignored.
+
+                    This file also defines the order of the reference
+                    sequences in sorting.
+                    If you run 'samtools faidx <ref.fa>',
+                    the resultant index file <ref.fa>.fai can be used
+                    as this <in.ref_list> file.""",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-o", "o"],
+                "Output file",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-f", "f"],
+                """Only output alignments with all bits in
+                    INT present in the FLAG field""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-F", "F"],
+                "Skip alignments with bits present in INT",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-q", "q"],
+                "Skip alignments with MAPQ smaller than INT",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-r", "r"],
+                "Only output reads in read group STR",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-R", "R"],
+                "Output reads in read groups listed in FILE",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-l", "l"],
+                "Only output reads in library STR",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Switch(
+                ["-1", "fast_bam"],
+                "Use zlib compression level 1 to compress the output",
+            ),
+            _Argument(
+                ["input", "input_file"],
+                "Input File Name",
+                filename=True,
+                is_required=True,
+            ),
+            _Argument(["region"], "Region", is_required=False),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsMpileupCommandline(AbstractCommandline):
+    """Command line wrapper for samtools mpileup.
+
+    Generate BCF or pileup for one or multiple BAM files, equivalent to::
+
+        $ samtools mpileup [-EBug] [-C capQcoef] [-r reg] [-f in.fa]
+                           [-l list] [-M capMapQ] [-Q minBaseQ]
+                           [-q minMapQ] in.bam [in2.bam [...]]
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsMpileupCommandline
+    >>> input = ["/path/to/sam_or_bam_file"]
+    >>> samtools_mpileup_cmd = SamtoolsMpileupCommandline(input_file=input)
+    >>> print(samtools_mpileup_cmd)
+    samtools mpileup /path/to/sam_or_bam_file
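+
+    A further sketch adding a faidx-indexed reference and a minimum base
+    quality (the path and the cutoff are only placeholders):
+
+    >>> samtools_mpileup_cmd = SamtoolsMpileupCommandline(
+    ...     input_file=input, f="/path/to/reference.fasta", Q=13)
+    >>> print(samtools_mpileup_cmd)
+    samtools mpileup -f /path/to/reference.fasta -Q 13 /path/to/sam_or_bam_file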
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("mpileup"),
+            _Switch(
+                ["-E", "E"],
+                """Extended BAQ computation.
+                    This option helps sensitivity especially
+                    for MNPs, but may hurt specificity a little bit""",
+            ),
+            _Switch(
+                ["-B", "B"],
+                """Disable probabilistic realignment for the
+                    computation of base alignment quality (BAQ).
+
+                    BAQ is the Phred-scaled probability of a read base being
+                    misaligned.
+                    Applying this option greatly helps to reduce false SNPs
+                    caused by misalignments""",
+            ),
+            _Switch(
+                ["-g", "g"],
+                """Compute genotype likelihoods and output them in the
+                    binary call format (BCF)""",
+            ),
+            _Switch(
+                ["-u", "u"],
+                """Similar to -g except that the output is
+                    uncompressed BCF, which is preferred for piping""",
+            ),
+            _Option(
+                ["-C", "C"],
+                """Coefficient for downgrading mapping quality for
+                    reads containing excessive mismatches.
+
+                    Given a read with a phred-scaled probability q of
+                    being generated from the mapped position,
+                    the new mapping quality is about sqrt((INT-q)/INT)*INT.
+                    A zero value disables this functionality;
+                    if enabled, the recommended value for BWA is 50""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-r", "r"],
+                "Only generate pileup in region STR",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-f", "f"],
+                """The faidx-indexed reference file in the FASTA format.
+
+                    The file can be optionally compressed by razip""",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-l", "l"],
+                """BED or position list file containing a list of regions
+                    or sites where pileup or BCF should be generated""",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-M", "M"],
+                "Cap Mapping Quality at M",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-q", "q"],
+                "Minimum mapping quality for an alignment to be used",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-Q", "Q"],
+                "Minimum base quality for a base to be considered",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Switch(
+                ["-6", "illumina_13"],
+                "Assume the quality is in the Illumina 1.3+ encoding",
+            ),
+            _Switch(
+                ["-A", "A"], "Do not skip anomalous read pairs in variant calling."
+            ),
+            _Option(
+                ["-b", "b"],
+                "List of input BAM files, one file per line",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-d", "d"],
+                "At a position, read maximally INT reads per input BAM",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Switch(["-D", "D"], "Output per-sample read depth"),
+            _Switch(
+                ["-S", "S"],
+                """Output per-sample Phred-scaled
+                                strand bias P-value""",
+            ),
+            _Option(
+                ["-e", "e"],
+                """Phred-scaled gap extension sequencing error probability.
+
+                    Reducing INT leads to longer indels""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-h", "h"],
+                """Coefficient for modeling homopolymer errors.
+
+                    Given an l-long homopolymer run, the sequencing error
+                    of an indel of size s is modeled as INT*s/l""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Switch(["-I", "I"], "Do not perform INDEL calling"),
+            _Option(
+                ["-L", "L"],
+                """Skip INDEL calling if the average per-sample
+                    depth is above INT""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-o", "o"],
+                """Phred-scaled gap open sequencing error probability.
+
+                    Reducing INT leads to more indel calls.""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-p", "p"],
+                """Comma delimited list of platforms (determined by @RG-PL)
+                    from which indel candidates are obtained.
+
+                    It is recommended to collect indel candidates from
+                    sequencing technologies that have low indel error rate
+                    such as ILLUMINA""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _ArgumentList(
+                ["input_file"],
+                "Input File for generating mpileup",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsReheaderCommandline(AbstractCommandline):
+    """Command line wrapper for samtools reheader.
+
+    Replace the header in in.bam with the header
+    in in.header.sam, equivalent to::
+
+    $ samtools reheader <in.header.sam> <in.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsReheaderCommandline
+    >>> input_header = "/path/to/header_sam_file"
+    >>> input_bam = "/path/to/input_bam_file"
+    >>> reheader_cmd = SamtoolsReheaderCommandline(input_header=input_header,
+    ...                                            input_bam=input_bam)
+    >>> print(reheader_cmd)
+    samtools reheader /path/to/header_sam_file /path/to/input_bam_file
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("reheader"),
+            _Argument(
+                ["input_header", "header_sam", "sam_file"],
+                "Sam file with header",
+                filename=True,
+                is_required=True,
+            ),
+            _Argument(
+                ["input_bam", "input_file", "bam_file"],
+                "BAM file for writing header to",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsCatCommandline(AbstractCommandline):
+    """Command line wrapper for samtools cat.
+
+    Concatenate BAMs, equivalent to::
+
+        $ samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ]
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsCatCommandline
+    >>> input_bam1 = "/path/to/input_bam1"
+    >>> input_bam2 = "/path/to/input_bam2"
+    >>> input_bams = [input_bam1, input_bam2]
+    >>> samtools_cat_cmd = SamtoolsCatCommandline(input_bam=input_bams)
+    >>> print(samtools_cat_cmd)
+    samtools cat /path/to/input_bam1 /path/to/input_bam2
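+
+    Naming the concatenated output explicitly (the output path is only a
+    placeholder):
+
+    >>> samtools_cat_cmd = SamtoolsCatCommandline(input_bam=input_bams,
+    ...                                           o="/path/to/out.bam")
+    >>> print(samtools_cat_cmd)
+    samtools cat -o /path/to/out.bam /path/to/input_bam1 /path/to/input_bam2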
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("cat"),
+            _Option(
+                ["-h", "h"],
+                "Header SAM file",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-o", "o"],
+                "Output BAM file",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _ArgumentList(
+                ["input", "input_bam", "bams"],
+                "Input BAM files",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsVersion0xSortCommandline(AbstractCommandline):
+    """Command line wrapper for samtools version 0.1.x sort.
+
+    Sort alignments by leftmost coordinates, equivalent to::
+
+    $ samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsVersion0xSortCommandline
+    >>> input_bam = "/path/to/input_bam"
+    >>> out_prefix = "/path/to/out_prefix"
+    >>> samtools_sort_cmd = SamtoolsVersion0xSortCommandline(input=input_bam, out_prefix=out_prefix)
+    >>> print(samtools_sort_cmd)
+    samtools sort /path/to/input_bam /path/to/out_prefix
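+
+    Sorting by read name with an explicit memory cap (the value is only
+    illustrative):
+
+    >>> samtools_sort_cmd = SamtoolsVersion0xSortCommandline(
+    ...     input=input_bam, out_prefix=out_prefix, n=True, m=1000000000)
+    >>> print(samtools_sort_cmd)
+    samtools sort -n -m 1000000000 /path/to/input_bam /path/to/out_prefix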
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+
+        # options for version samtools 0.0.19
+        self.parameters = [
+            _StaticArgument("sort"),
+            _Switch(
+                ["-o", "o"],
+                """Output the final alignment
+                                    to the standard output""",
+            ),
+            _Switch(
+                ["-n", "n"],
+                """Sort by read names rather
+                                    than by chromosomal coordinates""",
+            ),
+            _Option(
+                ["-m", "m"],
+                "Approximately the maximum required memory",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Argument(["input"], "Input BAM file", filename=True, is_required=True),
+            _Argument(["out_prefix"], "Output prefix", filename=True, is_required=True),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsVersion1xSortCommandline(AbstractCommandline):
+    """Command line wrapper for samtools version 1.3.x sort.
+
+    Sort alignments by leftmost coordinates, or by read name when -n is
+    used, equivalent to::
+
+    $ samtools sort [-n] [-T PREFIX] [-o file] [-I INT] [-m maxMem] <in.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsVersion1xSortCommandline
+    >>> input_bam = "/path/to/input_bam"
+    >>> PREFIX = "/path/to/out_prefix"
+    >>> file_name = "/path/to/out_file"
+    >>> samtools_sort_cmd = SamtoolsVersion1xSortCommandline(input=input_bam, T=PREFIX, o=file_name)
+    >>> print(samtools_sort_cmd)
+    samtools sort -o /path/to/out_file -T /path/to/out_prefix /path/to/input_bam
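+
+    Requesting an explicit output format (here BAM):
+
+    >>> samtools_sort_cmd = SamtoolsVersion1xSortCommandline(
+    ...     input=input_bam, o=file_name, O="bam", T=PREFIX)
+    >>> print(samtools_sort_cmd)
+    samtools sort -o /path/to/out_file -O bam -T /path/to/out_prefix /path/to/input_bam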
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+
+        # options for version samtools 1.3.1
+        self.parameters = [
+            _StaticArgument("sort"),
+            _Switch(
+                ["-n", "n"],
+                """Sort by read names rather
+                                    than by chromosomal coordinates""",
+            ),
+            _Option(
+                ["-o", "o"],
+                """(file) Write the final sorted output to FILE,
+                    rather than to standard output""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-O", "O"],
+                """(FORMAT) Write the final output as sam, bam, or cram""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-T", "T"],
+                """(PREFIX) Write temporary files to PREFIX.nnnn.bam, or if the specified PREFIX
+                    is an existing directory, to PREFIX/samtools.mmm.mmm.tmp.nnnn.bam,
+                    where mmm is unique to this invocation of the sort command""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-I", "I"],
+                """(INT) Set the desired compression level for the final output file,
+                    ranging from 0 (uncompressed) or 1 (fastest but minimal compression)
+                    to 9 (best compression but slowest to write), similarly to gzip(1)'s compression level setting.""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-m", "m"],
+                "Approximately the maximum required memory",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Argument(
+                ["input"], "Input SAM/BAM/CRAM file", filename=True, is_required=True
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsMergeCommandline(AbstractCommandline):
+    """Command line wrapper for samtools merge.
+
+    Merge multiple sorted alignments, equivalent to::
+
+        $ samtools merge [-nur1f] [-h inh.sam] [-R reg] <out.bam>
+                         <in1.bam> <in2.bam> [...]
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsMergeCommandline
+    >>> out_bam = "/path/to/out_bam"
+    >>> in_bam = ["/path/to/input_bam1", "/path/to/input_bam2"]
+    >>> merge_cmd = SamtoolsMergeCommandline(out_bam=out_bam,
+    ...                                      input_bam=in_bam)
+    >>> print(merge_cmd)
+    samtools merge /path/to/out_bam /path/to/input_bam1 /path/to/input_bam2
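+
+    Merging name-sorted input into uncompressed output (switch names as
+    defined in this wrapper):
+
+    >>> merge_cmd = SamtoolsMergeCommandline(out_bam=out_bam,
+    ...                                      input_bam=in_bam, n=True, u=True)
+    >>> print(merge_cmd)
+    samtools merge -n -u /path/to/out_bam /path/to/input_bam1 /path/to/input_bam2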
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("merge"),
+            _Switch(
+                ["-n", "n"],
+                """The input alignments are sorted by read names
+                    rather than by chromosomal coordinates""",
+            ),
+            _Switch(
+                ["-r", "r"],
+                """Attach an RG tag to each alignment.
+                    The tag value is inferred from file names""",
+            ),
+            _Switch(["-u", "u"], "Uncompressed BAM output"),
+            _Switch(
+                ["-1", "fast_bam"],
+                """Use zlib compression level 1
+                                           to compress the output""",
+            ),
+            _Switch(
+                ["-f", "f"],
+                """Force overwriting the
+                                    output file if present""",
+            ),
+            _Option(
+                ["-h", "h"],
+                """Use the lines of FILE as '@'
+                                    headers to be copied to out.bam""",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-R", "R"],
+                "Merge files in the specified region indicated by STR",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Argument(
+                ["output_bam", "out_bam", "out", "output"],
+                "Output BAM file",
+                filename=True,
+                is_required=True,
+            ),
+            _ArgumentList(
+                ["input_bam", "in_bam", "input", "bam"],
+                "Input BAM",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsIndexCommandline(AbstractCommandline):
+    """Command line wrapper for samtools index.
+
+    Index sorted alignment for fast random access, equivalent to::
+
+    $ samtools index <aln.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsIndexCommandline
+    >>> input = "/path/to/aln_bam"
+    >>> samtools_index_cmd = SamtoolsIndexCommandline(input_bam=input)
+    >>> print(samtools_index_cmd)
+    samtools index /path/to/aln_bam
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("index"),
+            _Argument(["input", "in_bam", "input_bam"], "BAM file to be indexed"),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsIdxstatsCommandline(AbstractCommandline):
+    """Command line wrapper for samtools idxstats.
+
+    Retrieve and print stats in the index file, equivalent to::
+
+    $ samtools idxstats <aln.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsIdxstatsCommandline
+    >>> input = "/path/to/aln_bam"
+    >>> samtools_idxstats_cmd = SamtoolsIdxstatsCommandline(input_bam=input)
+    >>> print(samtools_idxstats_cmd)
+    samtools idxstats /path/to/aln_bam
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("idxstats"),
+            _Argument(["input", "in_bam", "input_bam"], "Indexed BAM file"),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsFaidxCommandline(AbstractCommandline):
+    """Command line wrapper for samtools faidx.
+
+    Index the reference sequence in FASTA format, equivalent to::
+
+    $ samtools faidx <ref.fasta> [region1 [...]]
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsFaidxCommandline
+    >>> reference = "/path/to/reference.fasta"
+    >>> samtools_faidx_cmd = SamtoolsFaidxCommandline(reference=reference)
+    >>> print(samtools_faidx_cmd)
+    samtools faidx /path/to/reference.fasta
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("faidx"),
+            _Argument(
+                ["reference", "reference_fasta", "ref"],
+                "Reference FASTA to be indexed",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsFixmateCommandline(AbstractCommandline):
+    """Command line wrapper for samtools fixmate.
+
+    Fill in mate coordinates, ISIZE and mate related
+    flags from a name-sorted alignment, equivalent to::
+
+    $ samtools fixmate <in.nameSrt.bam> <out.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsFixmateCommandline
+    >>> in_bam = "/path/to/in.nameSrt.bam"
+    >>> out_bam = "/path/to/out.bam"
+    >>> fixmate_cmd = SamtoolsFixmateCommandline(input_bam=in_bam,
+    ...                                          out_bam=out_bam)
+    >>> print(fixmate_cmd)
+    samtools fixmate /path/to/in.nameSrt.bam /path/to/out.bam
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("fixmate"),
+            _Argument(
+                ["in_bam", "sorted_bam", "input_bam", "input", "input_file"],
+                "Name Sorted Alignment File ",
+                filename=True,
+                is_required=True,
+            ),
+            _Argument(
+                ["out_bam", "output_bam", "output", "output_file"],
+                "Output file",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsRmdupCommandline(AbstractCommandline):
+    """Command line wrapper for samtools rmdup.
+
+    Remove potential PCR duplicates, equivalent to::
+
+    $ samtools rmdup [-sS] <input.srt.bam> <out.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsRmdupCommandline
+    >>> input_sorted_bam = "/path/to/input.srt.bam"
+    >>> out_bam = "/path/to/out.bam"
+    >>> rmdup_cmd = SamtoolsRmdupCommandline(input_bam=input_sorted_bam,
+    ...                                      out_bam=out_bam)
+    >>> print(rmdup_cmd)
+    samtools rmdup /path/to/input.srt.bam /path/to/out.bam
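+
+    For single-end data, add the -s switch:
+
+    >>> rmdup_cmd = SamtoolsRmdupCommandline(input_bam=input_sorted_bam,
+    ...                                      out_bam=out_bam, s=True)
+    >>> print(rmdup_cmd)
+    samtools rmdup -s /path/to/input.srt.bam /path/to/out.bam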
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("rmdup"),
+            _Switch(
+                ["-s", "s"],
+                """Remove duplicates for single-end reads.
+
+                    By default, the command works for paired-end
+                    reads only""",
+            ),
+            _Switch(
+                ["-S", "S"],
+                """Treat paired-end reads
+                                    as single-end reads""",
+            ),
+            _Argument(
+                ["in_bam", "sorted_bam", "input_bam", "input", "input_file"],
+                "Coordinate-sorted alignment file",
+                filename=True,
+                is_required=True,
+            ),
+            _Argument(
+                ["out_bam", "output_bam", "output", "output_file"],
+                "Output file",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsCalmdCommandline(AbstractCommandline):
+    """Command line wrapper for samtools calmd.
+
+    Generate the MD tag, equivalent to::
+
+    $ samtools calmd [-EeubSr] [-C capQcoef] <aln.bam> <ref.fasta>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsCalmdCommandline
+    >>> input_bam = "/path/to/aln.bam"
+    >>> reference_fasta = "/path/to/reference.fasta"
+    >>> calmd_cmd = SamtoolsCalmdCommandline(input_bam=input_bam,
+    ...                                      reference=reference_fasta)
+    >>> print(calmd_cmd)
+    samtools calmd /path/to/aln.bam /path/to/reference.fasta
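+
+    With uncompressed output and a mapping-quality cap (the coefficient
+    50 is only illustrative):
+
+    >>> calmd_cmd = SamtoolsCalmdCommandline(input_bam=input_bam,
+    ...                                      reference=reference_fasta,
+    ...                                      u=True, C=50)
+    >>> print(calmd_cmd)
+    samtools calmd -u -C 50 /path/to/aln.bam /path/to/reference.fasta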
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("calmd"),
+            _Switch(
+                ["-E", "E"],
+                """Extended BAQ calculation.
+                    This option trades specificity for sensitivity,
+                    though the effect is minor.""",
+            ),
+            _Switch(
+                ["-e", "e"],
+                """Convert the read base to = if it is
+                    identical to the aligned reference base.
+
+                    Indel caller does not support the = bases
+                    at the moment.""",
+            ),
+            _Switch(["-u", "u"], "Output uncompressed BAM"),
+            _Switch(["-b", "b"], "Output compressed BAM "),
+            _Switch(["-S", "S"], "The input is SAM with header lines "),
+            _Switch(
+                ["-r", "r"],
+                """Compute the BQ tag (without -A)
+                    or cap base quality by BAQ (with -A).""",
+            ),
+            _Switch(
+                ["-A", "A"],
+                """When used jointly with -r this option overwrites
+                    the original base quality""",
+            ),
+            _Option(
+                ["-C", "C"],
+                """Coefficient to cap mapping quality
+                    of poorly mapped reads.
+
+                    See the pileup command for details.""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Argument(
+                ["input", "input_file", "in_bam", "infile", "input_bam"],
+                "Input BAM",
+                filename=True,
+                is_required=True,
+            ),
+            _Argument(
+                ["reference", "reference_fasta", "ref"],
+                "faidx-indexed reference FASTA file",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsTargetcutCommandline(AbstractCommandline):
+    """Command line wrapper for samtools targetcut.
+
+    This command identifies target regions by examining the continuity
+    of read depth, computes haploid consensus sequences of targets
+    and outputs a SAM with each sequence corresponding to a target,
+    equivalent to::
+
+        $ samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0]
+                             [-1 em1] [-2 em2] [-f ref] <name-srt.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsTargetcutCommandline
+    >>> input_bam = "/path/to/aln.bam"
+    >>> samtools_targetcut_cmd = SamtoolsTargetcutCommandline(input_bam=input_bam)
+    >>> print(samtools_targetcut_cmd)
+    samtools targetcut /path/to/aln.bam
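+
+    Supplying a reference and a minimum base quality (the path and the
+    value are only placeholders):
+
+    >>> samtools_targetcut_cmd = SamtoolsTargetcutCommandline(
+    ...     input_bam=input_bam, Q=13, f="/path/to/reference.fasta")
+    >>> print(samtools_targetcut_cmd)
+    samtools targetcut -Q 13 -f /path/to/reference.fasta /path/to/aln.bam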
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("targetcut"),
+            _Option(
+                ["-Q", "Q"],
+                "Minimum Base Quality ",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-i", "i"],
+                "Insertion Penalty",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-f", "f"],
+                "Reference Filename",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-0", "em0"],
+                "em0",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-1", "em1"],
+                "em1",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Option(
+                ["-2", "em2"],
+                "em2",
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Argument(
+                ["input", "input_bam", "in_bam"],
+                "Input file",
+                filename=True,
+                is_required=True,
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class SamtoolsPhaseCommandline(AbstractCommandline):
+    """Command line wrapper for samtools phase.
+
+    Call and phase heterozygous SNPs, equivalent to::
+
+        $ samtools phase [-AF] [-k len] [-b prefix]
+                         [-q minLOD] [-Q minBaseQ] <in.bam>
+
+    See http://samtools.sourceforge.net/samtools.shtml for more details
+
+    Examples
+    --------
+    >>> from Bio.Sequencing.Applications import SamtoolsPhaseCommandline
+    >>> input_bam = "/path/to/in.bam"
+    >>> samtools_phase_cmd = SamtoolsPhaseCommandline(input_bam=input_bam)
+    >>> print(samtools_phase_cmd)
+    samtools phase /path/to/in.bam
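+
+    Note that this wrapper lists the input file before the options, so
+    the generated command places it first (values are only illustrative):
+
+    >>> samtools_phase_cmd = SamtoolsPhaseCommandline(input_bam=input_bam,
+    ...                                               k=13, Q=13)
+    >>> print(samtools_phase_cmd)
+    samtools phase /path/to/in.bam -k 13 -Q 13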
+
+    """
+
+    def __init__(self, cmd="samtools", **kwargs):
+        """Initialize the class."""
+        self.program_name = cmd
+        self.parameters = [
+            _StaticArgument("phase"),
+            _Argument(
+                ["input", "input_bam", "in_bam"],
+                "Input file",
+                filename=True,
+                is_required=True,
+            ),
+            _Switch(["-A", "A"], "Drop reads with ambiguous phase"),
+            _Option(
+                ["-b", "b"],
+                "Prefix of BAM output",
+                filename=True,
+                equate=False,
+                checker_function=lambda x: isinstance(x, str),
+            ),
+            _Switch(["-F", "F"], "Do not attempt to fix chimeric reads"),
+            _Option(
+                ["-k", "k"],
+                "Maximum length for local phasing",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-q", "q"],
+                """Minimum Phred-scaled LOD to
+                    call a heterozygote""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+            _Option(
+                ["-Q", "Q"],
+                """Minimum base quality to be
+                    used in het calling""",
+                equate=False,
+                checker_function=lambda x: isinstance(x, int),
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/Sequencing/Phd.py b/code/lib/Bio/Sequencing/Phd.py
new file mode 100644
index 0000000..6e7cc8d
--- /dev/null
+++ b/code/lib/Bio/Sequencing/Phd.py
@@ -0,0 +1,199 @@
+# Copyright 2004 by Cymon J. Cox and Frank Kauff.  All rights reserved.
+# Copyright 2008 by Michiel de Hoon.  All rights reserved.
+# Revisions copyright 2009 by Cymon J. Cox.  All rights reserved.
+# Revisions copyright 2009 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Parser for PHD files output by PHRED and used by PHRAP and CONSED.
+
+This module can be used directly, which will return Record objects
+containing all the original data in the file.
+
+Alternatively, using Bio.SeqIO with the "phd" format will call this module
+internally.  This will give SeqRecord objects for each contig sequence.
+"""
+
+from Bio import Seq
+
+
+CKEYWORDS = [
+    "CHROMAT_FILE",
+    "ABI_THUMBPRINT",
+    "PHRED_VERSION",
+    "CALL_METHOD",
+    "QUALITY_LEVELS",
+    "TIME",
+    "TRACE_ARRAY_MIN_INDEX",
+    "TRACE_ARRAY_MAX_INDEX",
+    "TRIM",
+    "TRACE_PEAK_AREA_RATIO",
+    "CHEM",
+    "DYE",
+]
+
+
+class Record:
+    """Hold information from a PHD file."""
+
+    def __init__(self):
+        """Initialize the class."""
+        self.file_name = ""
+        self.comments = {}
+        for kw in CKEYWORDS:
+            self.comments[kw.lower()] = None
+        self.sites = []
+        self.seq = ""
+        self.seq_trimmed = ""
+
+
+def read(source):
+    """Read one PHD record from the file and return it as a Record object.
+
+    Argument source is a file-like object opened in text mode, or a path
+    to a file.
+
+    This function reads PHD file data line by line from the source, and
+    returns a single Record object. A ValueError is raised if more than
+    one record is found in the file.
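+
+    Typical usage (the file name is a placeholder)::
+
+        record = read("my_file.phd")
+        print(record.file_name)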
+    """
+    handle = _open(source)
+    try:
+        record = _read(handle)
+        try:
+            next(handle)
+        except StopIteration:
+            return record
+        else:
+            raise ValueError("More than one PHD record found")
+    finally:
+        if handle is not source:
+            handle.close()
+
+
+def parse(source):
+    """Iterate over a file yielding multiple PHD records.
+
+    Argument source is a file-like object opened in text mode, or a path
+    to a file.
+
+    The data is read line by line from the source.
+
+    Typical usage::
+
+        records = parse(handle)
+        for record in records:
+            # do something with the record object
+
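+    For example, to print each record's name and sequence (the file name
+    is a placeholder)::
+
+        with open("my_file.phd") as handle:
+            for record in parse(handle):
+                print(record.file_name, record.seq)
+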
+    """
+    handle = _open(source)
+    try:
+        while True:
+            record = _read(handle)
+            if not record:
+                return
+            yield record
+    finally:
+        if handle is not source:
+            handle.close()
+
+
+# Everything below is considered private
+
+
+def _open(source):
+    # Accept either a path (opened here in text mode) or an existing
+    # handle; open() raises TypeError when given a file-like object.
+    try:
+        handle = open(source)
+    except TypeError:
+        handle = source
+        # On a binary-mode handle read(0) returns b"" rather than "",
+        # so this detects the wrong mode without consuming any data.
+        if handle.read(0) != "":
+            raise ValueError("PHD files must be opened in text mode.") from None
+    return handle
+
+
+def _read(handle):
+    for line in handle:
+        if line.startswith("BEGIN_SEQUENCE"):
+            record = Record()
+            record.file_name = line[15:].rstrip()
+            break
+    else:
+        return  # No record found
+
+    for line in handle:
+        if line.startswith("BEGIN_COMMENT"):
+            break
+    else:
+        raise ValueError("Failed to find BEGIN_COMMENT line")
+
+    for line in handle:
+        line = line.strip()
+        if not line:
+            continue
+        if line == "END_COMMENT":
+            break
+        keyword, value = line.split(":", 1)
+        keyword = keyword.lower()
+        value = value.strip()
+        if keyword in (
+            "chromat_file",
+            "phred_version",
+            "call_method",
+            "chem",
+            "dye",
+            "time",
+            "basecaller_version",
+            "trace_processor_version",
+        ):
+            record.comments[keyword] = value
+        elif keyword in (
+            "abi_thumbprint",
+            "quality_levels",
+            "trace_array_min_index",
+            "trace_array_max_index",
+        ):
+            record.comments[keyword] = int(value)
+        elif keyword == "trace_peak_area_ratio":
+            record.comments[keyword] = float(value)
+        elif keyword == "trim":
+            first, last, prob = value.split()
+            record.comments[keyword] = (int(first), int(last), float(prob))
+    else:
+        raise ValueError("Failed to find END_COMMENT line")
+
+    for line in handle:
+        if line.startswith("BEGIN_DNA"):
+            break
+    else:
+        raise ValueError("Failed to find BEGIN_DNA line")
+
+    for line in handle:
+        if line.startswith("END_DNA"):
+            break
+        else:
+            # Line is: "site quality peak_location"
+            # Peak location is optional according to
+            # David Gordon (the Consed author)
+            parts = line.split()
+            if len(parts) in [2, 3]:
+                record.sites.append(tuple(parts))
+            else:
+                raise ValueError(
+                    "DNA line must contain a base and quality "
+                    "score, and optionally a peak location."
+                )
+
+    for line in handle:
+        if line.startswith("END_SEQUENCE"):
+            break
+    else:
+        raise ValueError("Failed to find END_SEQUENCE line")
+
+    record.seq = Seq.Seq("".join(n[0] for n in record.sites))
+    if record.comments["trim"] is not None:
+        first, last = record.comments["trim"][:2]
+        record.seq_trimmed = record.seq[first:last]
+
+    return record
diff --git a/code/lib/Bio/Sequencing/__init__.py b/code/lib/Bio/Sequencing/__init__.py
new file mode 100644
index 0000000..927b866
--- /dev/null
+++ b/code/lib/Bio/Sequencing/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2004 Frank Kauff. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to deal with various programs for sequencing and assembly.
+
+This code deals with programs such as Phred, Phrap and Consed -- which provide
+utilities for calling bases from sequencing reads, and assembling sequences
+into contigs.
+"""
diff --git a/code/lib/Bio/Sequencing/__pycache__/Ace.cpython-37.pyc b/code/lib/Bio/Sequencing/__pycache__/Ace.cpython-37.pyc
new file mode 100644
index 0000000..98990a6
Binary files /dev/null and b/code/lib/Bio/Sequencing/__pycache__/Ace.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/__pycache__/Phd.cpython-37.pyc b/code/lib/Bio/Sequencing/__pycache__/Phd.cpython-37.pyc
new file mode 100644
index 0000000..89f4d63
Binary files /dev/null and b/code/lib/Bio/Sequencing/__pycache__/Phd.cpython-37.pyc differ
diff --git a/code/lib/Bio/Sequencing/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Sequencing/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..912e97a
Binary files /dev/null and b/code/lib/Bio/Sequencing/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SubsMat/FreqTable.py b/code/lib/Bio/SubsMat/FreqTable.py
new file mode 100644
index 0000000..f0af932
--- /dev/null
+++ b/code/lib/Bio/SubsMat/FreqTable.py
@@ -0,0 +1,107 @@
+# Copyright 2000 by Iddo Friedberg idoerg@cc.huji.ac.il
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+r"""A class to handle frequency tables or letter count files.
+
+Example files for a DNA alphabet:
+
+A count file (whitespace separated)::
+
+ A  50
+ C  37
+ G  23
+ T  58
+
+The same info as a frequency file::
+
+ A 0.2976
+ C 0.2202
+ G 0.1369
+ T 0.3452
+
+Functions:
+  :read_count(f): read a count file from stream f. Then convert to
+                  frequencies.
+  :read_freq(f): read a frequency data file from stream f. Of course, we then
+                 don't have the counts, but it is usually the letter frequencies
+                 which are interesting.
+
+Methods:
+  (all internal)
+
+Attributes:
+  :alphabet: The letters you are using as indices into the table.
+  :data: Frequency dictionary.
+  :count: Count dictionary. Empty if no counts are provided.
+
+Example of use:
+    >>> import io
+    >>> from Bio.SubsMat import FreqTable
+    >>> f_count = io.StringIO(u"A  50\nC  37\nG  23\nT  58")
+    >>> ftab = FreqTable.read_count(f_count)
+    >>> for nb in sorted(ftab):
+    ...     print("%s %0.4f" %(nb, ftab[nb]))
+    ...
+    A 0.2976
+    C 0.2202
+    G 0.1369
+    T 0.3452
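+
+    A frequency file can be loaded the same way with read_freq (the
+    values here mirror the count example above):
+
+    >>> f_freq = io.StringIO(u"A 0.2976\nC 0.2202\nG 0.1369\nT 0.3452")
+    >>> ftab = FreqTable.read_freq(f_freq)
+    >>> print("%0.4f" % ftab["A"])
+    0.2976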
+
+"""
+
+
+COUNT = 1
+FREQ = 2
+
+
+class FreqTable(dict):
+    """Define class to handle frequency tables or letter count files."""
+
+    def _freq_from_count(self):
+        """Calculate frequency from count values (PRIVATE)."""
+        total = float(sum(self.count.values()))
+        for i, v in self.count.items():
+            self[i] = v / total
+
+    def _alphabet_from_input(self):
+        """Order the alphabet (PRIVATE)."""
+        return "".join(sorted(self))
+
+    def __init__(self, in_dict, dict_type, alphabet=None):
+        """Initialize the class."""
+        self.alphabet = alphabet
+        if dict_type == COUNT:
+            self.count = in_dict
+            self._freq_from_count()
+        elif dict_type == FREQ:
+            self.count = {}
+            self.update(in_dict)
+        else:
+            raise ValueError("bad dict_type")
+        if not alphabet:
+            self.alphabet = self._alphabet_from_input()
+
+
+def read_count(f):
+    """Read a count file f and load values to the Frequency Table."""
+    count = {}
+    for line in f:
+        key, value = line.strip().split()
+        count[key] = int(value)
+    return FreqTable(count, COUNT)
+
+
+def read_freq(f):
+    """Read a frequency data file f and load values to the Frequency Table."""
+    freq_dict = {}
+    for line in f:
+        key, value = line.strip().split()
+        freq_dict[key] = float(value)
+    return FreqTable(freq_dict, FREQ)
diff --git a/code/lib/Bio/SubsMat/MatrixInfo.py b/code/lib/Bio/SubsMat/MatrixInfo.py
new file mode 100644
index 0000000..fc86cb0
--- /dev/null
+++ b/code/lib/Bio/SubsMat/MatrixInfo.py
@@ -0,0 +1,2724 @@
+# Copyright 2000 by Iddo Friedberg
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Substitution matrices for use in alignments, etc.
+
+The information on this page was originally obtained from Dr. Gerhard Vogt's
+page http://www.embl-heidelberg.de/~vogt/matrices/mlist1.html (dead link),
+and was extracted using a script.
+
+You can view an archive copy of this webpage from 1999 here:
+https://web.archive.org/web/19991014010917/http://www.embl-heidelberg.de/%7Evogt/matrices/mlist1.html
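+
+Each matrix is a plain dict keyed by tuples of amino-acid one-letter
+codes; each pair is stored only once, so reverse the tuple if a lookup
+fails. A minimal illustration using the benner6 matrix defined below::
+
+    >>> from Bio.SubsMat.MatrixInfo import benner6
+    >>> benner6[("W", "F")]
+    -1.6
+    >>> pair = ("F", "W")
+    >>> pair if pair in benner6 else (pair[1], pair[0])
+    ('W', 'F')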
+"""
+
+# The data rich dictionaries do not lend themselves to black style, turn it off:
+# fmt: off
+
+# a list of all available substitution matrices
+available_matrices = ["benner6", "benner22", "benner74", "blosum100",
+                      "blosum30", "blosum35", "blosum40", "blosum45",
+                      "blosum50", "blosum55", "blosum60", "blosum62",
+                      "blosum65", "blosum70", "blosum75", "blosum80",
+                      "blosum85", "blosum90", "blosum95", "feng",
+                      "fitch", "genetic", "gonnet", "grant",
+                      "ident", "johnson", "levin", "mclach",
+                      "miyata", "nwsgappep", "pam120", "pam180",
+                      "pam250", "pam30", "pam300", "pam60",
+                      "pam90", "rao", "risler", "structure",
+                      ]
+
+# http://www.embl-heidelberg.de/~vogt/matrices/benner6.cmp
+benner6 = {
+    ("W", "F"): -1.6, ("L", "R"): -3.2, ("I", "I"): 4.4, ("Q", "Q"): 5.3,
+    ("W", "N"): -4.4, ("V", "I"): 3.9, ("H", "T"): -1.7, ("H", "P"): -0.4,
+    ("W", "V"): -4.8, ("Q", "E"): 2.1, ("W", "R"): 2.0, ("Q", "A"): -1.7,
+    ("H", "H"): 6.1, ("H", "D"): 0.1, ("L", "N"): -3.4, ("Y", "M"): -3.6,
+    ("Y", "I"): -3.3, ("Y", "E"): -4.1, ("E", "S"): -1.2, ("Y", "A"): -4.0,
+    ("Y", "Y"): 9.5, ("T", "C"): -1.5, ("E", "C"): -4.7, ("Y", "Q"): -1.4,
+    ("E", "G"): 0.5, ("V", "A"): 0.7, ("C", "C"): 12.1, ("M", "R"): -3.0,
+    ("P", "T"): 0.6, ("V", "E"): -3.0, ("P", "P"): 6.5, ("I", "T"): 0.7,
+    ("K", "S"): -1.2, ("R", "G"): -0.1, ("I", "P"): -2.0, ("R", "C"): -0.4,
+    ("A", "T"): 1.7, ("K", "K"): 5.6, ("A", "P"): 1.1, ("V", "M"): 3.3,
+    ("I", "D"): -4.2, ("K", "C"): -2.8, ("K", "G"): -1.4, ("R", "S"): -0.9,
+    ("F", "Q"): -4.4, ("F", "A"): -3.2, ("V", "V"): 4.0, ("M", "N"): -2.5,
+    ("F", "E"): -6.7, ("D", "N"): 2.5, ("F", "I"): 0.0, ("F", "M"): -0.1,
+    ("M", "S"): -1.3, ("S", "S"): 2.1, ("L", "Q"): -2.4, ("W", "E"): -5.6,
+    ("W", "A"): -4.3, ("W", "M"): -4.4, ("H", "S"): -0.9, ("W", "I"): -5.0,
+    ("S", "C"): 0.9, ("L", "A"): -1.3, ("L", "E"): -5.0, ("W", "Q"): -2.6,
+    ("H", "G"): -2.1, ("Q", "N"): 0.1, ("H", "C"): -1.2, ("L", "M"): -2.9,
+    ("W", "Y"): -0.3, ("Y", "N"): -0.9, ("E", "P"): -2.6, ("Y", "F"): 5.6,
+    ("E", "T"): -1.6, ("A", "A"): 2.5, ("I", "N"): -2.5, ("G", "A"): 0.8,
+    ("Y", "V"): -3.8, ("E", "D"): 4.4, ("W", "H"): -2.8, ("Y", "R"): -2.6,
+    ("M", "Q"): -3.1, ("P", "S"): 1.4, ("R", "H"): 1.8, ("A", "C"): -1.7,
+    ("R", "D"): -1.5, ("K", "P"): -2.3, ("L", "D"): -5.3, ("K", "T"): -1.1,
+    ("V", "N"): -2.4, ("M", "A"): -0.2, ("K", "H"): 0.9, ("V", "R"): -3.7,
+    ("P", "C"): -2.7, ("M", "E"): -4.1, ("A", "S"): 1.4, ("T", "T"): 2.4,
+    ("R", "T"): -1.3, ("I", "G"): -3.4, ("R", "P"): -1.3, ("K", "D"): -0.2,
+    ("I", "C"): -3.6, ("F", "R"): -4.9, ("F", "V"): -0.5, ("L", "C"): -3.8,
+    ("F", "F"): 8.3, ("D", "A"): -0.6, ("F", "N"): -3.5, ("W", "D"): -6.3,
+    ("L", "P"): -0.2, ("Q", "S"): -1.4, ("N", "C"): -1.6, ("N", "G"): -0.1,
+    ("H", "N"): 1.4, ("W", "T"): -2.6, ("Q", "G"): -1.6, ("W", "P"): -4.8,
+    ("Q", "C"): -3.2, ("N", "S"): 1.2, ("L", "H"): -2.2, ("L", "L"): 4.8,
+    ("G", "T"): -0.5, ("M", "M"): 4.8, ("G", "P"): -1.7, ("Y", "K"): -4.0,
+    ("Y", "G"): -4.9, ("Y", "C"): 2.6, ("E", "A"): -0.7, ("E", "E"): 5.2,
+    ("Y", "S"): -1.8, ("M", "P"): -1.8, ("V", "C"): -3.1, ("M", "T"): 0.6,
+    ("V", "G"): -2.3, ("R", "E"): -0.4, ("V", "K"): -3.8, ("K", "Q"): 2.5,
+    ("R", "A"): -1.7, ("I", "R"): -3.8, ("N", "A"): 0.0, ("V", "S"): -0.9,
+    ("M", "D"): -4.3, ("M", "H"): -3.4, ("K", "A"): -1.9, ("R", "Q"): 2.5,
+    ("K", "E"): 0.9, ("F", "S"): -1.8, ("I", "K"): -3.8, ("D", "P"): -2.8,
+    ("D", "T"): -1.2, ("I", "M"): 4.0, ("F", "C"): -0.1, ("W", "L"): -3.0,
+    ("F", "G"): -5.7, ("F", "K"): -6.3, ("F", "T"): -2.4, ("D", "D"): 5.2,
+    ("Q", "T"): -1.7, ("W", "G"): -1.7, ("Q", "P"): 0.1, ("W", "C"): 1.6,
+    ("W", "K"): -1.4, ("H", "Q"): 3.2, ("Q", "D"): 0.6, ("W", "W"): 14.7,
+    ("V", "L"): 1.9, ("L", "G"): -4.6, ("W", "S"): -2.9, ("L", "K"): -4.1,
+    ("N", "P"): -1.1, ("H", "E"): -0.2, ("N", "T"): 0.5, ("H", "A"): -2.1,
+    ("Y", "L"): -1.6, ("Y", "H"): 4.4, ("G", "S"): 0.8, ("Y", "D"): -2.3,
+    ("V", "Q"): -3.5, ("L", "T"): -0.4, ("G", "G"): 5.8, ("G", "C"): -1.3,
+    ("E", "N"): 1.1, ("Y", "T"): -3.4, ("Y", "P"): -3.8, ("R", "N"): -0.1,
+    ("V", "D"): -3.3, ("K", "R"): 4.3, ("V", "H"): -3.8, ("I", "Q"): -3.8,
+    ("V", "P"): -1.6, ("M", "C"): -3.7, ("K", "N"): 1.0, ("V", "T"): 0.6,
+    ("M", "G"): -3.7, ("T", "S"): 1.5, ("I", "E"): -4.1, ("M", "K"): -2.9,
+    ("I", "A"): 0.1, ("N", "N"): 3.6, ("R", "R"): 5.1, ("F", "P"): -3.2,
+    ("L", "I"): 2.4, ("I", "S"): -1.2, ("D", "S"): -0.4, ("L", "S"): -1.5,
+    ("I", "H"): -3.7, ("F", "D"): -5.7, ("D", "C"): -3.7, ("F", "H"): 0.1,
+    ("D", "G"): 0.8, ("F", "L"): 2.4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/benner22.cmp
+benner22 = {
+    ("W", "F"): 0.5, ("L", "R"): -2.9, ("I", "I"): 4.2, ("Q", "Q"): 4.2,
+    ("W", "N"): -5.2, ("V", "I"): 3.6, ("H", "T"): -1.1, ("H", "P"): -0.4,
+    ("W", "V"): -4.5, ("Q", "E"): 1.7, ("W", "R"): -1.1, ("Q", "A"): -0.9,
+    ("H", "H"): 6.1, ("H", "D"): 0.3, ("L", "N"): -3.5, ("Y", "M"): -1.8,
+    ("Y", "I"): -2.2, ("Y", "E"): -4.0, ("E", "S"): -0.5, ("Y", "A"): -3.5,
+    ("Y", "Y"): 9.0, ("T", "C"): -1.1, ("E", "C"): -4.3, ("Y", "Q"): -1.9,
+    ("E", "G"): 0.5, ("V", "A"): 0.4, ("C", "C"): 12.6, ("M", "R"): -2.1,
+    ("P", "T"): 0.4, ("V", "E"): -2.7, ("P", "P"): 7.0, ("I", "T"): 0.3,
+    ("K", "S"): -0.4, ("R", "G"): -0.7, ("I", "P"): -2.3, ("R", "C"): -1.6,
+    ("A", "T"): 1.4, ("K", "K"): 4.4, ("A", "P"): 0.8, ("V", "M"): 2.5,
+    ("I", "D"): -4.0, ("K", "C"): -3.3, ("K", "G"): -1.0, ("R", "S"): -0.5,
+    ("F", "Q"): -3.6, ("F", "A"): -3.1, ("V", "V"): 3.7, ("M", "N"): -2.6,
+    ("F", "E"): -5.7, ("D", "N"): 2.4, ("F", "I"): 0.5, ("F", "M"): 0.7,
+    ("M", "S"): -1.5, ("S", "S"): 2.0, ("L", "Q"): -2.0, ("W", "E"): -6.3,
+    ("W", "A"): -5.5, ("W", "M"): -2.8, ("H", "S"): -0.5, ("W", "I"): -4.4,
+    ("S", "C"): 0.3, ("L", "A"): -1.7, ("L", "E"): -4.4, ("W", "Q"): -3.3,
+    ("H", "G"): -2.0, ("Q", "N"): 0.5, ("H", "C"): -1.5, ("L", "M"): 3.2,
+    ("W", "Y"): 1.5, ("Y", "N"): -1.2, ("E", "P"): -1.7, ("Y", "F"): 5.9,
+    ("E", "T"): -0.9, ("A", "A"): 2.5, ("I", "N"): -2.7, ("G", "A"): 0.8,
+    ("Y", "V"): -2.6, ("E", "D"): 3.9, ("W", "H"): -2.7, ("Y", "R"): -2.7,
+    ("M", "Q"): -1.7, ("P", "S"): 1.1, ("R", "H"): 1.5, ("A", "C"): -1.2,
+    ("R", "D"): -1.0, ("K", "P"): -1.6, ("L", "D"): -4.9, ("K", "T"): -0.4,
+    ("V", "N"): -2.3, ("M", "A"): -0.8, ("K", "H"): 0.8, ("V", "R"): -2.9,
+    ("P", "C"): -3.1, ("M", "E"): -3.4, ("A", "S"): 1.3, ("T", "T"): 2.5,
+    ("R", "T"): -0.7, ("I", "G"): -3.8, ("R", "P"): -1.2, ("K", "D"): 0.2,
+    ("I", "C"): -2.4, ("F", "R"): -4.3, ("F", "V"): -0.1, ("L", "C"): -2.6,
+    ("F", "F"): 7.7, ("D", "A"): -0.2, ("F", "N"): -3.5, ("W", "D"): -6.4,
+    ("L", "P"): -1.3, ("Q", "S"): -0.6, ("N", "C"): -1.9, ("N", "G"): 0.4,
+    ("H", "N"): 1.4, ("W", "T"): -4.5, ("Q", "G"): -1.4, ("W", "P"): -5.8,
+    ("Q", "C"): -3.3, ("N", "S"): 1.1, ("L", "H"): -2.1, ("L", "L"): 4.6,
+    ("G", "T"): -0.7, ("M", "M"): 4.9, ("G", "P"): -1.8, ("Y", "K"): -3.6,
+    ("Y", "G"): -4.8, ("Y", "C"): 0.6, ("E", "A"): -0.3, ("E", "E"): 4.6,
+    ("Y", "S"): -1.9, ("M", "P"): -2.0, ("V", "C"): -1.7, ("M", "T"): 0.1,
+    ("V", "G"): -2.5, ("R", "E"): -0.1, ("V", "K"): -2.7, ("K", "Q"): 2.2,
+    ("R", "A"): -1.2, ("I", "R"): -3.2, ("N", "A"): 0.0, ("V", "S"): -0.9,
+    ("M", "D"): -3.9, ("M", "H"): -2.4, ("K", "A"): -1.0, ("R", "Q"): 2.2,
+    ("K", "E"): 1.0, ("F", "S"): -2.2, ("I", "K"): -3.0, ("D", "P"): -1.8,
+    ("D", "T"): -0.7, ("I", "M"): 3.1, ("F", "C"): -0.1, ("W", "L"): -1.8,
+    ("F", "G"): -5.8, ("F", "K"): -5.1, ("F", "T"): -2.6, ("D", "D"): 4.8,
+    ("Q", "T"): -0.7, ("W", "G"): -4.5, ("Q", "P"): -0.1, ("W", "C"): 0.5,
+    ("W", "K"): -3.7, ("H", "Q"): 2.4, ("Q", "D"): 0.6, ("W", "W"): 15.7,
+    ("V", "L"): 2.0, ("L", "G"): -4.9, ("W", "S"): -3.9, ("L", "K"): -3.3,
+    ("N", "P"): -1.1, ("H", "E"): -0.2, ("N", "T"): 0.5, ("H", "A"): -1.6,
+    ("Y", "L"): -0.7, ("Y", "H"): 3.7, ("G", "S"): 0.6, ("Y", "D"): -3.0,
+    ("V", "Q"): -2.4, ("L", "T"): -1.0, ("G", "G"): 6.2, ("G", "C"): -1.7,
+    ("E", "N"): 1.2, ("Y", "T"): -3.0, ("Y", "P"): -3.5, ("R", "N"): 0.4,
+    ("V", "D"): -3.0, ("K", "R"): 3.9, ("V", "H"): -3.0, ("I", "Q"): -2.7,
+    ("V", "P"): -1.7, ("M", "C"): -2.5, ("K", "N"): 1.0, ("V", "T"): 0.4,
+    ("M", "G"): -3.8, ("T", "S"): 1.5, ("I", "E"): -3.6, ("M", "K"): -2.0,
+    ("I", "A"): -0.4, ("N", "N"): 3.3, ("R", "R"): 5.0, ("F", "P"): -3.4,
+    ("L", "I"): 2.7, ("I", "S"): -1.4, ("D", "S"): 0.1, ("L", "S"): -2.1,
+    ("I", "H"): -3.2, ("F", "D"): -5.4, ("D", "C"): -3.7, ("F", "H"): 0.3,
+    ("D", "G"): 0.7, ("F", "L"): 2.2
+}
+
+
+assert benner6 != benner22
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/benner74.cmp
+benner74 = {
+    ("W", "F"): 3.0, ("L", "R"): -2.4, ("I", "I"): 4.0, ("Q", "Q"): 3.0,
+    ("W", "N"): -4.0, ("V", "I"): 3.2, ("H", "T"): -0.5, ("H", "P"): -1.0,
+    ("W", "V"): -2.9, ("Q", "E"): 1.7, ("W", "R"): -1.6, ("Q", "A"): -0.3,
+    ("H", "H"): 6.1, ("H", "D"): 0.4, ("L", "N"): -3.1, ("Y", "M"): -0.5,
+    ("Y", "I"): -1.0, ("Y", "E"): -3.0, ("E", "S"): 0.1, ("Y", "A"): -2.6,
+    ("Y", "Y"): 8.1, ("T", "C"): -0.6, ("E", "C"): -3.2, ("Y", "Q"): -1.8,
+    ("E", "G"): -0.5, ("V", "A"): 0.1, ("C", "C"): 11.8, ("M", "R"): -1.8,
+    ("P", "T"): 0.1, ("V", "E"): -2.1, ("P", "P"): 7.5, ("I", "T"): -0.3,
+    ("K", "S"): 0.0, ("R", "G"): -1.0, ("I", "P"): -2.6, ("R", "C"): -2.2,
+    ("A", "T"): 0.7, ("K", "K"): 3.4, ("A", "P"): 0.4, ("V", "M"): 1.8,
+    ("I", "D"): -3.9, ("K", "C"): -2.9, ("K", "G"): -1.1, ("R", "S"): -0.2,
+    ("F", "Q"): -2.8, ("F", "A"): -2.6, ("V", "V"): 3.4, ("M", "N"): -2.2,
+    ("F", "E"): -4.3, ("D", "N"): 2.2, ("F", "I"): 0.9, ("F", "M"): 1.3,
+    ("M", "S"): -1.4, ("S", "S"): 2.1, ("L", "Q"): -1.7, ("W", "E"): -4.7,
+    ("W", "A"): -4.1, ("W", "M"): -1.3, ("H", "S"): -0.3, ("W", "I"): -2.3,
+    ("S", "C"): 0.1, ("L", "A"): -1.4, ("L", "E"): -3.1, ("W", "Q"): -2.8,
+    ("H", "G"): -1.6, ("Q", "N"): 0.7, ("H", "C"): -1.3, ("L", "M"): 2.9,
+    ("W", "Y"): 3.6, ("Y", "N"): -1.4, ("E", "P"): -0.7, ("Y", "F"): 5.3,
+    ("E", "T"): -0.2, ("A", "A"): 2.4, ("I", "N"): -2.8, ("G", "A"): 0.6,
+    ("Y", "V"): -1.4, ("E", "D"): 2.9, ("W", "H"): -1.0, ("Y", "R"): -2.0,
+    ("M", "Q"): -1.0, ("P", "S"): 0.5, ("R", "H"): 1.0, ("A", "C"): 0.3,
+    ("R", "D"): -0.5, ("K", "P"): -0.8, ("L", "D"): -4.2, ("K", "T"): 0.1,
+    ("V", "N"): -2.2, ("M", "A"): -0.8, ("K", "H"): 0.6, ("V", "R"): -2.2,
+    ("P", "C"): -3.1, ("M", "E"): -2.2, ("A", "S"): 1.1, ("T", "T"): 2.5,
+    ("R", "T"): -0.3, ("I", "G"): -4.3, ("R", "P"): -0.1, ("K", "D"): 0.4,
+    ("I", "C"): -1.2, ("F", "R"): -3.5, ("F", "V"): 0.1, ("L", "C"): -1.6,
+    ("F", "F"): 7.2, ("D", "A"): -0.3, ("F", "N"): -3.2, ("W", "D"): -5.5,
+    ("L", "P"): -2.2, ("Q", "S"): 0.1, ("N", "C"): -1.8, ("N", "G"): 0.4,
+    ("H", "N"): 1.2, ("W", "T"): -3.7, ("Q", "G"): -1.1, ("W", "P"): -5.2,
+    ("Q", "C"): -2.6, ("N", "S"): 0.9, ("L", "H"): -1.9, ("L", "L"): 4.2,
+    ("G", "T"): -1.0, ("M", "M"): 4.5, ("G", "P"): -1.7, ("Y", "K"): -2.4,
+    ("Y", "G"): -4.3, ("Y", "C"): -0.4, ("E", "A"): -0.1, ("E", "E"): 3.7,
+    ("Y", "S"): -1.9, ("M", "P"): -2.4, ("V", "C"): -0.2, ("M", "T"): -0.4,
+    ("V", "G"): -3.1, ("R", "E"): 0.3, ("V", "K"): -1.9, ("K", "Q"): 1.7,
+    ("R", "A"): -0.8, ("I", "R"): -2.6, ("N", "A"): -0.2, ("V", "S"): -1.0,
+    ("M", "D"): -3.2, ("M", "H"): -1.5, ("K", "A"): -0.4, ("R", "Q"): 1.6,
+    ("K", "E"): 1.2, ("F", "S"): -2.6, ("I", "K"): -2.3, ("D", "P"): -1.0,
+    ("D", "T"): -0.2, ("I", "M"): 2.6, ("F", "C"): -0.7, ("W", "L"): -0.9,
+    ("F", "G"): -5.4, ("F", "K"): -3.6, ("F", "T"): -2.2, ("D", "D"): 4.8,
+    ("Q", "T"): -0.1, ("W", "G"): -4.1, ("Q", "P"): -0.2, ("W", "C"): -0.9,
+    ("W", "K"): -3.6, ("H", "Q"): 1.4, ("Q", "D"): 0.8, ("W", "W"): 14.7,
+    ("V", "L"): 1.9, ("L", "G"): -4.6, ("W", "S"): -3.4, ("L", "K"): -2.4,
+    ("N", "P"): -1.0, ("H", "E"): 0.2, ("N", "T"): 0.4, ("H", "A"): -1.0,
+    ("Y", "L"): -0.1, ("Y", "H"): 2.5, ("G", "S"): 0.4, ("Y", "D"): -2.8,
+    ("V", "Q"): -1.7, ("L", "T"): -1.1, ("G", "G"): 6.6, ("G", "C"): -2.0,
+    ("E", "N"): 1.0, ("Y", "T"): -2.1, ("Y", "P"): -3.4, ("R", "N"): 0.3,
+    ("V", "D"): -2.9, ("K", "R"): 2.9, ("V", "H"): -2.1, ("I", "Q"): -2.0,
+    ("V", "P"): -1.9, ("M", "C"): -1.2, ("K", "N"): 0.9, ("V", "T"): 0.2,
+    ("M", "G"): -3.5, ("T", "S"): 1.4, ("I", "E"): -2.9, ("M", "K"): -1.5,
+    ("I", "A"): -0.8, ("N", "N"): 3.6, ("R", "R"): 4.8, ("F", "P"): -3.8,
+    ("L", "I"): 2.8, ("I", "S"): -1.8, ("D", "S"): 0.4, ("L", "S"): -2.2,
+    ("I", "H"): -2.3, ("F", "D"): -4.7, ("D", "C"): -3.2, ("F", "H"): 0.0,
+    ("D", "G"): 0.2, ("F", "L"): 2.1
+}
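+
+
+# The pair matrices in this module store each unordered amino-acid pair under
+# a single key orientation (e.g. benner74 has ("W", "F") but not ("F", "W")),
+# so a naive dict lookup can raise KeyError. A minimal lookup sketch follows;
+# the name `pair_score` is illustrative, not an established API:
+def pair_score(matrix, a, b):
+    """Return the substitution score for residues a, b, trying both key orders."""
+    try:
+        return matrix[(a, b)]
+    except KeyError:
+        return matrix[(b, a)]
+
+
+# Usage: pair_score(benner74, "F", "W") returns benner74[("W", "F")], i.e. 3.0.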
+
+
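+# Note: unlike the Benner matrices, the BLOSUM matrices below also include the
+# ambiguity codes B (Asn/Asp), Z (Gln/Glu) and X (any residue) among their keys.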
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum100.cmp
+blosum100 = {
+    ("W", "F"): 0, ("L", "R"): -4, ("S", "P"): -2, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -4, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 9, ("S", "H"): -2,
+    ("H", "D"): -2, ("L", "N"): -5, ("W", "A"): -4, ("Y", "M"): -3,
+    ("G", "R"): -4, ("Y", "I"): -3, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -4, ("V", "D"): -5, ("B", "S"): -1, ("Y", "Y"): 8,
+    ("G", "N"): -2, ("E", "C"): -6, ("Y", "Q"): -3, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): -1, ("P", "P"): 8, ("V", "I"): 2, ("V", "S"): -3,
+    ("Z", "P"): -3, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -3, ("I", "H"): -5, ("I", "D"): -6,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -3, ("M", "N"): -4,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -4, ("X", "L"): -2,
+    ("T", "M"): -2, ("Z", "C"): -6, ("X", "H"): -2, ("D", "R"): -3,
+    ("B", "W"): -6, ("X", "D"): -3, ("Z", "K"): 0, ("F", "A"): -4,
+    ("Z", "W"): -4, ("F", "E"): -5, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -1,
+    ("F", "M"): -1, ("B", "C"): -5, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 6, ("L", "Q"): -3, ("W", "E"): -5, ("Q", "R"): 0,
+    ("N", "N"): 7, ("W", "M"): -3, ("Q", "C"): -5, ("W", "I"): -4,
+    ("S", "C"): -2, ("L", "A"): -3, ("S", "G"): -1, ("L", "E"): -5,
+    ("W", "Q"): -3, ("H", "G"): -4, ("S", "K"): -1, ("Q", "N"): -1,
+    ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -4,
+    ("G", "A"): -1, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3,
+    ("M", "Q"): -1, ("T", "I"): -2, ("C", "D"): -5, ("V", "F"): -2,
+    ("T", "A"): -1, ("T", "P"): -3, ("B", "P"): -3, ("T", "E"): -2,
+    ("V", "N"): -4, ("P", "G"): -4, ("M", "A"): -2, ("K", "H"): -2,
+    ("V", "R"): -4, ("P", "C"): -5, ("M", "E"): -4, ("K", "L"): -4,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -2, ("I", "G"): -6,
+    ("P", "K"): -2, ("M", "M"): 8, ("K", "D"): -2, ("I", "C"): -2,
+    ("Z", "D"): 0, ("F", "R"): -4, ("X", "K"): -2, ("Q", "D"): -2,
+    ("X", "G"): -3, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): -1,
+    ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -4,
+    ("B", "D"): 4, ("D", "A"): -3, ("S", "L"): -4, ("X", "S"): -1,
+    ("F", "N"): -5, ("S", "R"): -2, ("W", "D"): -7, ("V", "Y"): -3,
+    ("W", "L"): -4, ("H", "R"): -1, ("W", "H"): -3, ("H", "N"): 0,
+    ("W", "T"): -5, ("T", "T"): 6, ("S", "F"): -3, ("W", "P"): -6,
+    ("L", "D"): -6, ("B", "I"): -5, ("L", "H"): -4, ("S", "N"): 0,
+    ("B", "T"): -2, ("L", "L"): 5, ("Y", "K"): -4, ("E", "Q"): 1,
+    ("Y", "G"): -6, ("Z", "S"): -1, ("Y", "C"): -4, ("G", "D"): -3,
+    ("B", "V"): -5, ("E", "A"): -2, ("Y", "W"): 1, ("E", "E"): 6,
+    ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -3,
+    ("P", "R"): -3, ("V", "G"): -5, ("T", "L"): -3, ("V", "K"): -4,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2,
+    ("P", "F"): -5, ("I", "N"): -5, ("K", "I"): -4, ("M", "D"): -5,
+    ("V", "W"): -4, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -4,
+    ("K", "A"): -2, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 5,
+    ("X", "N"): -2, ("Z", "A"): -2, ("Z", "M"): -3, ("X", "F"): -3,
+    ("K", "C"): -5, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4,
+    ("F", "C"): -3, ("Z", "Q"): 3, ("X", "Z"): -2, ("F", "G"): -5,
+    ("B", "E"): 0, ("X", "V"): -2, ("F", "K"): -4, ("B", "A"): -3,
+    ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -5, ("Z", "F"): -5,
+    ("S", "Q"): -1, ("W", "C"): -5, ("W", "K"): -5, ("H", "Q"): 0,
+    ("L", "C"): -3, ("W", "N"): -6, ("S", "A"): 1, ("L", "G"): -5,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -4,
+    ("H", "A"): -3, ("S", "M"): -3, ("Y", "L"): -3, ("Y", "H"): 1,
+    ("Y", "D"): -5, ("E", "R"): -2, ("X", "P"): -3, ("G", "G"): 6,
+    ("G", "C"): -5, ("E", "N"): -1, ("Y", "T"): -3, ("Y", "P"): -5,
+    ("T", "K"): -2, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2,
+    ("V", "H"): -5, ("T", "G"): -3, ("I", "Q"): -4, ("Z", "T"): -2,
+    ("C", "R"): -5, ("V", "P"): -4, ("P", "E"): -3, ("M", "C"): -3,
+    ("K", "N"): -1, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -5,
+    ("T", "S"): 1, ("I", "E"): -5, ("P", "M"): -4, ("M", "K"): -2,
+    ("I", "A"): -3, ("P", "I"): -4, ("R", "R"): 7, ("X", "M"): -2,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 1, ("X", "E"): -2,
+    ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -3, ("Z", "R"): -1, ("F", "H"): -2,
+    ("B", "F"): -5, ("F", "L"): 0, ("X", "Q"): -2, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum30.cmp
+blosum30 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 1,
+    ("Q", "Q"): 8, ("N", "A"): 0, ("Z", "Y"): -2, ("W", "R"): 0,
+    ("Q", "A"): 1, ("S", "D"): 0, ("H", "H"): 14, ("S", "H"): -1,
+    ("H", "D"): -2, ("L", "N"): -2, ("W", "A"): -5, ("Y", "M"): -1,
+    ("G", "R"): -2, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -4, ("V", "D"): -2, ("B", "S"): 0, ("Y", "Y"): 9,
+    ("G", "N"): 0, ("E", "C"): 1, ("Y", "Q"): -1, ("Z", "Z"): 4,
+    ("V", "A"): 1, ("C", "C"): 17, ("M", "R"): 0, ("V", "E"): -3,
+    ("T", "N"): 1, ("P", "P"): 11, ("V", "I"): 4, ("V", "S"): -1,
+    ("Z", "P"): 0, ("V", "M"): 0, ("T", "F"): -2, ("V", "Q"): -3,
+    ("K", "K"): 4, ("P", "D"): -1, ("I", "H"): -2, ("I", "D"): -4,
+    ("T", "R"): -3, ("P", "L"): -3, ("K", "G"): -1, ("M", "N"): 0,
+    ("P", "H"): 1, ("F", "Q"): -3, ("Z", "G"): -2, ("X", "L"): 0,
+    ("T", "M"): 0, ("Z", "C"): 0, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -5, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -1, ("F", "E"): -4, ("D", "N"): 1, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): 0, ("X", "T"): 0,
+    ("F", "M"): -2, ("B", "C"): -2, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -1, ("Q", "R"): 3,
+    ("N", "N"): 8, ("W", "M"): -3, ("Q", "C"): -2, ("W", "I"): -3,
+    ("S", "C"): -2, ("L", "A"): -1, ("S", "G"): 0, ("L", "E"): -1,
+    ("W", "Q"): -1, ("H", "G"): -3, ("S", "K"): 0, ("Q", "N"): -1,
+    ("N", "R"): -2, ("H", "C"): -5, ("Y", "N"): -4, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -3, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 1, ("E", "D"): 1, ("Y", "R"): 0,
+    ("M", "Q"): -1, ("T", "I"): 0, ("C", "D"): -3, ("V", "F"): 1,
+    ("T", "A"): 1, ("T", "P"): 0, ("B", "P"): -2, ("T", "E"): -2,
+    ("V", "N"): -2, ("P", "G"): -1, ("M", "A"): 1, ("K", "H"): -2,
+    ("V", "R"): -1, ("P", "C"): -3, ("M", "E"): -1, ("K", "L"): -2,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): 0, ("I", "G"): -1,
+    ("P", "K"): 1, ("M", "M"): 6, ("K", "D"): 0, ("I", "C"): -2,
+    ("Z", "D"): 0, ("F", "R"): -1, ("X", "K"): 0, ("Q", "D"): -1,
+    ("X", "G"): -1, ("Z", "L"): -1, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -1, ("B", "H"): -2, ("F", "F"): 10, ("X", "W"): -2,
+    ("B", "D"): 5, ("D", "A"): 0, ("S", "L"): -2, ("X", "S"): 0,
+    ("F", "N"): -1, ("S", "R"): -1, ("W", "D"): -4, ("V", "Y"): 1,
+    ("W", "L"): -2, ("H", "R"): -1, ("W", "H"): -5, ("H", "N"): -1,
+    ("W", "T"): -5, ("T", "T"): 5, ("S", "F"): -1, ("W", "P"): -3,
+    ("L", "D"): -1, ("B", "I"): -2, ("L", "H"): -1, ("S", "N"): 0,
+    ("B", "T"): 0, ("L", "L"): 4, ("Y", "K"): -1, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): -1, ("Y", "C"): -6, ("G", "D"): -1,
+    ("B", "V"): -2, ("E", "A"): 0, ("Y", "W"): 5, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -1, ("V", "C"): -2, ("T", "H"): -2,
+    ("P", "R"): -1, ("V", "G"): -3, ("T", "L"): 0, ("V", "K"): -2,
+    ("K", "Q"): 0, ("R", "A"): -1, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): 0, ("K", "I"): -2, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 20, ("M", "H"): 2, ("P", "N"): -3,
+    ("K", "A"): 0, ("M", "L"): 2, ("K", "E"): 2, ("Z", "E"): 5,
+    ("X", "N"): 0, ("Z", "A"): 0, ("Z", "M"): -1, ("X", "F"): -1,
+    ("K", "C"): -3, ("B", "Q"): -1, ("X", "B"): -1, ("B", "M"): -2,
+    ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): 0, ("F", "G"): -3,
+    ("B", "E"): 0, ("X", "V"): 0, ("F", "K"): -1, ("B", "A"): 0,
+    ("X", "R"): -1, ("D", "D"): 9, ("W", "G"): 1, ("Z", "F"): -4,
+    ("S", "Q"): -1, ("W", "C"): -2, ("W", "K"): -2, ("H", "Q"): 0,
+    ("L", "C"): 0, ("W", "N"): -7, ("S", "A"): 1, ("L", "G"): -2,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -1,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): 3, ("Y", "H"): 0,
+    ("Y", "D"): -1, ("E", "R"): -1, ("X", "P"): -1, ("G", "G"): 8,
+    ("G", "C"): -4, ("E", "N"): -1, ("Y", "T"): -1, ("Y", "P"): -2,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): 0, ("T", "C"): -2,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -2, ("Z", "T"): -1,
+    ("C", "R"): -2, ("V", "P"): -4, ("P", "E"): 1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 6, ("P", "A"): -1, ("M", "G"): -2,
+    ("T", "S"): 2, ("I", "E"): -3, ("P", "M"): -4, ("M", "K"): 2,
+    ("I", "A"): 0, ("P", "I"): -3, ("R", "R"): 8, ("X", "M"): 0,
+    ("L", "I"): 2, ("X", "I"): 0, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): -1, ("X", "A"): 0, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -3,
+    ("B", "F"): -3, ("F", "L"): 2, ("X", "Q"): 0, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum35.cmp
+blosum35 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -2, ("V", "T"): 1,
+    ("Q", "Q"): 7, ("N", "A"): -1, ("Z", "Y"): -1, ("W", "R"): 0,
+    ("Q", "A"): 0, ("S", "D"): -1, ("H", "H"): 12, ("S", "H"): -1,
+    ("H", "D"): 0, ("L", "N"): -2, ("W", "A"): -2, ("Y", "M"): 0,
+    ("G", "R"): -2, ("Y", "I"): 0, ("Y", "E"): -1, ("B", "Y"): -2,
+    ("Y", "A"): -1, ("V", "D"): -2, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): 1, ("E", "C"): -1, ("Y", "Q"): 0, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 15, ("M", "R"): 0, ("V", "E"): -2,
+    ("T", "N"): 0, ("P", "P"): 10, ("V", "I"): 4, ("V", "S"): -1,
+    ("Z", "P"): 0, ("V", "M"): 1, ("T", "F"): -1, ("V", "Q"): -3,
+    ("K", "K"): 5, ("P", "D"): -1, ("I", "H"): -3, ("I", "D"): -3,
+    ("T", "R"): -2, ("P", "L"): -3, ("K", "G"): -1, ("M", "N"): -1,
+    ("P", "H"): -1, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): 0,
+    ("T", "M"): 0, ("Z", "C"): -2, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -3, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -1, ("F", "E"): -3, ("D", "N"): 1, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 1, ("B", "G"): 0, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -2, ("Z", "I"): -3, ("Z", "V"): -2,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -1, ("Q", "R"): 2,
+    ("N", "N"): 7, ("W", "M"): 1, ("Q", "C"): -3, ("W", "I"): -1,
+    ("S", "C"): -3, ("L", "A"): -2, ("S", "G"): 1, ("L", "E"): -1,
+    ("W", "Q"): -1, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 1,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -2, ("V", "L"): 2, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 2, ("Y", "R"): 0,
+    ("M", "Q"): -1, ("T", "I"): -1, ("C", "D"): -3, ("V", "F"): 1,
+    ("T", "A"): 0, ("T", "P"): 0, ("B", "P"): -1, ("T", "E"): -1,
+    ("V", "N"): -2, ("P", "G"): -2, ("M", "A"): 0, ("K", "H"): -2,
+    ("V", "R"): -1, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -2,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): 0, ("I", "G"): -3,
+    ("P", "K"): 0, ("M", "M"): 6, ("K", "D"): -1, ("I", "C"): -4,
+    ("Z", "D"): 1, ("F", "R"): -1, ("X", "K"): 0, ("Q", "D"): -1,
+    ("X", "G"): -1, ("Z", "L"): -2, ("X", "C"): -2, ("Z", "H"): -1,
+    ("B", "L"): -2, ("B", "H"): 0, ("F", "F"): 8, ("X", "W"): -1,
+    ("B", "D"): 5, ("D", "A"): -1, ("S", "L"): -2, ("X", "S"): 0,
+    ("F", "N"): -1, ("S", "R"): -1, ("W", "D"): -3, ("V", "Y"): 0,
+    ("W", "L"): 0, ("H", "R"): -1, ("W", "H"): -4, ("H", "N"): 1,
+    ("W", "T"): -2, ("T", "T"): 5, ("S", "F"): -1, ("W", "P"): -4,
+    ("L", "D"): -2, ("B", "I"): -2, ("L", "H"): -2, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 5, ("Y", "K"): -1, ("E", "Q"): 2,
+    ("Y", "G"): -2, ("Z", "S"): 0, ("Y", "C"): -5, ("G", "D"): -2,
+    ("B", "V"): -2, ("E", "A"): -1, ("Y", "W"): 3, ("E", "E"): 6,
+    ("Y", "S"): -1, ("C", "N"): -1, ("V", "C"): -2, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -3, ("T", "L"): 0, ("V", "K"): -2,
+    ("K", "Q"): 0, ("R", "A"): -1, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -1, ("K", "I"): -2, ("M", "D"): -3,
+    ("V", "W"): -2, ("W", "W"): 16, ("M", "H"): 1, ("P", "N"): -2,
+    ("K", "A"): 0, ("M", "L"): 3, ("K", "E"): 1, ("Z", "E"): 5,
+    ("X", "N"): 0, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -1,
+    ("K", "C"): -2, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -2,
+    ("F", "C"): -4, ("Z", "Q"): 4, ("X", "Z"): 0, ("F", "G"): -3,
+    ("B", "E"): 0, ("X", "V"): 0, ("F", "K"): -1, ("B", "A"): -1,
+    ("X", "R"): -1, ("D", "D"): 8, ("W", "G"): -1, ("Z", "F"): -3,
+    ("S", "Q"): 0, ("W", "C"): -5, ("W", "K"): 0, ("H", "Q"): -1,
+    ("L", "C"): -2, ("W", "N"): -2, ("S", "A"): 1, ("L", "G"): -3,
+    ("W", "S"): -2, ("S", "E"): 0, ("H", "E"): -1, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -1, ("Y", "L"): 0, ("Y", "H"): 0,
+    ("Y", "D"): -2, ("E", "R"): -1, ("X", "P"): -1, ("G", "G"): 7,
+    ("G", "C"): -3, ("E", "N"): -1, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): 0, ("A", "A"): 5, ("P", "Q"): 0, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -2, ("Z", "T"): -1,
+    ("C", "R"): -3, ("V", "P"): -3, ("P", "E"): 0, ("M", "C"): -4,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -2, ("M", "G"): -1,
+    ("T", "S"): 2, ("I", "E"): -3, ("P", "M"): -3, ("M", "K"): 0,
+    ("I", "A"): -1, ("P", "I"): -1, ("R", "R"): 8, ("X", "M"): 0,
+    ("L", "I"): 2, ("X", "I"): 0, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 4,
+    ("F", "D"): -3, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -3,
+    ("B", "F"): -2, ("F", "L"): 2, ("X", "Q"): -1, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum40.cmp
+blosum40 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 1,
+    ("Q", "Q"): 8, ("N", "A"): -1, ("Z", "Y"): -2, ("W", "R"): -2,
+    ("Q", "A"): 0, ("S", "D"): 0, ("H", "H"): 13, ("S", "H"): -1,
+    ("H", "D"): 0, ("L", "N"): -3, ("W", "A"): -3, ("Y", "M"): 1,
+    ("G", "R"): -3, ("Y", "I"): 0, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 9,
+    ("G", "N"): 0, ("E", "C"): -2, ("Y", "Q"): -1, ("Z", "Z"): 5,
+    ("V", "A"): 0, ("C", "C"): 16, ("M", "R"): -1, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 11, ("V", "I"): 4, ("V", "S"): -1,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -1, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -2, ("I", "H"): -3, ("I", "D"): -4,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -4, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -2, ("F", "E"): -3, ("D", "N"): 2, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 1, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -2, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -2, ("W", "E"): -2, ("Q", "R"): 2,
+    ("N", "N"): 8, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): 0, ("L", "E"): -2,
+    ("W", "Q"): -1, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 1,
+    ("N", "R"): 0, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 4, ("C", "A"): -2, ("V", "L"): 2, ("G", "E"): -3,
+    ("G", "A"): 1, ("K", "R"): 3, ("E", "D"): 2, ("Y", "R"): -1,
+    ("M", "Q"): -1, ("T", "I"): -1, ("C", "D"): -2, ("V", "F"): 0,
+    ("T", "A"): 0, ("T", "P"): 0, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -1, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -2, ("P", "C"): -5, ("M", "E"): -2, ("K", "L"): -2,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 7, ("K", "D"): 0, ("I", "C"): -4,
+    ("Z", "D"): 1, ("F", "R"): -2, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -1, ("Z", "L"): -2, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -3, ("B", "H"): 0, ("F", "F"): 9, ("X", "W"): -2,
+    ("B", "D"): 6, ("D", "A"): -1, ("S", "L"): -3, ("X", "S"): 0,
+    ("F", "N"): -3, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -1,
+    ("W", "L"): -1, ("H", "R"): 0, ("W", "H"): -5, ("H", "N"): 1,
+    ("W", "T"): -4, ("T", "T"): 6, ("S", "F"): -2, ("W", "P"): -4,
+    ("L", "D"): -3, ("B", "I"): -3, ("L", "H"): -2, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 6, ("Y", "K"): -1, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -4, ("G", "D"): -2,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 3, ("E", "E"): 7,
+    ("Y", "S"): -2, ("C", "N"): -2, ("V", "C"): -2, ("T", "H"): -2,
+    ("P", "R"): -3, ("V", "G"): -4, ("T", "L"): -1, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -2, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 19, ("M", "H"): 1, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 3, ("K", "E"): 1, ("Z", "E"): 5,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -1,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -3,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -1,
+    ("X", "R"): -1, ("D", "D"): 9, ("W", "G"): -2, ("Z", "F"): -4,
+    ("S", "Q"): 1, ("W", "C"): -6, ("W", "K"): -2, ("H", "Q"): 0,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -5, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): 0, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): -1, ("X", "P"): -2, ("G", "G"): 8,
+    ("G", "C"): -3, ("E", "N"): -1, ("Y", "T"): -1, ("Y", "P"): -3,
+    ("T", "K"): 0, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -3, ("V", "P"): -3, ("P", "E"): 0, ("M", "C"): -3,
+    ("K", "N"): 0, ("I", "I"): 6, ("P", "A"): -2, ("M", "G"): -2,
+    ("T", "S"): 2, ("I", "E"): -4, ("P", "M"): -2, ("M", "K"): -1,
+    ("I", "A"): -1, ("P", "I"): -2, ("R", "R"): 9, ("X", "M"): 0,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 4,
+    ("F", "D"): -4, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -3, ("F", "L"): 2, ("X", "Q"): -1, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum45.cmp
+blosum45 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -1, ("Z", "Y"): -2, ("W", "R"): -2,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 10, ("S", "H"): -1,
+    ("H", "D"): 0, ("L", "N"): -3, ("W", "A"): -2, ("Y", "M"): 0,
+    ("G", "R"): -2, ("Y", "I"): 0, ("Y", "E"): -2, ("B", "Y"): -2,
+    ("Y", "A"): -2, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): 0, ("E", "C"): -3, ("Y", "Q"): -1, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 12, ("M", "R"): -1, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 9, ("V", "I"): 3, ("V", "S"): -1,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -1, ("V", "Q"): -3,
+    ("K", "K"): 5, ("P", "D"): -1, ("I", "H"): -3, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -4, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -2, ("F", "E"): -3, ("D", "N"): 2, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -2, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -2, ("Q", "C"): -3, ("W", "I"): -2,
+    ("S", "C"): -1, ("L", "A"): -1, ("S", "G"): 0, ("L", "E"): -2,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): 0, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 3, ("E", "D"): 2, ("Y", "R"): -1,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -3, ("V", "F"): 0,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -2, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 2, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): 0, ("I", "C"): -3,
+    ("Z", "D"): 1, ("F", "R"): -2, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -1, ("Z", "L"): -2, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -3, ("B", "H"): 0, ("F", "F"): 8, ("X", "W"): -2,
+    ("B", "D"): 5, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): 0,
+    ("F", "N"): -2, ("S", "R"): -1, ("W", "D"): -4, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 1,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -2, ("W", "P"): -3,
+    ("L", "D"): -3, ("B", "I"): -3, ("L", "H"): -2, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 5, ("Y", "K"): -1, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -1,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 3, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -2, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -3, ("T", "L"): -1, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -3, ("I", "N"): -2, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 15, ("M", "H"): 0, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -1, ("X", "F"): -1,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -2,
+    ("F", "C"): -2, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -3,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -1,
+    ("X", "R"): -1, ("D", "D"): 7, ("W", "G"): -2, ("Z", "F"): -3,
+    ("S", "Q"): 0, ("W", "C"): -5, ("W", "K"): -2, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -3,
+    ("W", "S"): -4, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): 0, ("Y", "H"): 2,
+    ("Y", "D"): -2, ("E", "R"): 0, ("X", "P"): -1, ("G", "G"): 7,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -1, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -2, ("Z", "T"): -1,
+    ("C", "R"): -3, ("V", "P"): -3, ("P", "E"): 0, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -2,
+    ("T", "S"): 2, ("I", "E"): -3, ("P", "M"): -2, ("M", "K"): -1,
+    ("I", "A"): -1, ("P", "I"): -2, ("R", "R"): 7, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 4,
+    ("F", "D"): -4, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -3, ("F", "L"): 1, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum50.cmp
+blosum50 = {
+    ("W", "F"): 1, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 7, ("N", "A"): -1, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 10, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): 0,
+    ("G", "R"): -3, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): 0, ("E", "C"): -3, ("Y", "Q"): -1, ("Z", "Z"): 5,
+    ("V", "A"): 0, ("C", "C"): 13, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 10, ("V", "I"): 4, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -1, ("I", "H"): -4, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -4, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -5, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -2, ("F", "E"): -3, ("D", "N"): 2, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -3, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 7, ("W", "M"): -1, ("Q", "C"): -3, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): 0, ("L", "E"): -3,
+    ("W", "Q"): -1, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 4, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 3, ("E", "D"): 2, ("Y", "R"): -1,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): 0,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 2, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 7, ("K", "D"): -1, ("I", "C"): -2,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): 0, ("F", "F"): 8, ("X", "W"): -3,
+    ("B", "D"): 5, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 1,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -4,
+    ("L", "D"): -4, ("B", "I"): -4, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 5, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -1,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -2, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -3, ("V", "G"): -4, ("T", "L"): -1, ("V", "K"): -3,
+    ("K", "Q"): 2, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 15, ("M", "H"): -1, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 3, ("K", "E"): 1, ("Z", "E"): 5,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -1, ("X", "F"): -2,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 8, ("W", "G"): -3, ("Z", "F"): -4,
+    ("S", "Q"): 0, ("W", "C"): -5, ("W", "K"): -3, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): 0, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 8,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 2, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 7, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -1, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -4, ("F", "L"): 1, ("X", "Q"): -1, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum55.cmp
+blosum55 = {
+    ("W", "F"): 1, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 7, ("N", "A"): -1, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 10, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): 0,
+    ("G", "R"): -3, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): 0, ("E", "C"): -3, ("Y", "Q"): -1, ("Z", "Z"): 5,
+    ("V", "A"): 0, ("C", "C"): 13, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 10, ("V", "I"): 4, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -1, ("I", "H"): -4, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -4, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -5, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -2, ("F", "E"): -3, ("D", "N"): 2, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -3, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 7, ("W", "M"): -1, ("Q", "C"): -3, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): 0, ("L", "E"): -3,
+    ("W", "Q"): -1, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 4, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 3, ("E", "D"): 2, ("Y", "R"): -1,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): 0,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 2, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 7, ("K", "D"): -1, ("I", "C"): -2,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): 0, ("F", "F"): 8, ("X", "W"): -3,
+    ("B", "D"): 5, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 1,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -4,
+    ("L", "D"): -4, ("B", "I"): -4, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 5, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -1,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -2, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -3, ("V", "G"): -4, ("T", "L"): -1, ("V", "K"): -3,
+    ("K", "Q"): 2, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 15, ("M", "H"): -1, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 3, ("K", "E"): 1, ("Z", "E"): 5,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -1, ("X", "F"): -2,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 8, ("W", "G"): -3, ("Z", "F"): -4,
+    ("S", "Q"): 0, ("W", "C"): -5, ("W", "K"): -3, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): 0, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 8,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 2, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 7, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -1, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -4, ("F", "L"): 1, ("X", "Q"): -1, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum60.cmp
+blosum60 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 5, ("N", "A"): -1, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 7, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -3, ("W", "A"): -3, ("Y", "M"): -1,
+    ("G", "R"): -2, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -2,
+    ("Y", "A"): -2, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 6,
+    ("G", "N"): 0, ("E", "C"): -3, ("Y", "Q"): -1, ("Z", "Z"): 3,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -1, ("V", "E"): -2,
+    ("T", "N"): 0, ("P", "P"): 7, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -2,
+    ("K", "K"): 4, ("P", "D"): -1, ("I", "H"): -3, ("I", "D"): -3,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -1, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -3, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -4, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -2, ("F", "E"): -3, ("D", "N"): 1, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -3, ("Z", "I"): -3, ("Z", "V"): -2,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -1, ("Q", "C"): -3, ("W", "I"): -2,
+    ("S", "C"): -1, ("L", "A"): -1, ("S", "G"): 0, ("L", "E"): -3,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): 0, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): 0, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 2, ("Y", "R"): -2,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -3, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -2, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -2,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -3,
+    ("P", "K"): -1, ("M", "M"): 5, ("K", "D"): -1, ("I", "C"): -1,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -1, ("Z", "L"): -2, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -3, ("B", "H"): 0, ("F", "F"): 6, ("X", "W"): -2,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -2, ("X", "S"): 0,
+    ("F", "N"): -3, ("S", "R"): -1, ("W", "D"): -4, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -2, ("H", "N"): 1,
+    ("W", "T"): -2, ("T", "T"): 4, ("S", "F"): -2, ("W", "P"): -4,
+    ("L", "D"): -3, ("B", "I"): -3, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 4, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -2, ("G", "D"): -1,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 5,
+    ("Y", "S"): -2, ("C", "N"): -2, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -3, ("T", "L"): -1, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -1, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 10, ("M", "H"): -1, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -1, ("X", "F"): -1,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -3,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -2, ("Z", "F"): -3,
+    ("S", "Q"): 0, ("W", "C"): -2, ("W", "K"): -3, ("H", "Q"): 1,
+    ("L", "C"): -1, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -1, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -2, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -3, ("V", "P"): -2, ("P", "E"): -1, ("M", "C"): -1,
+    ("K", "N"): 0, ("I", "I"): 4, ("P", "A"): -1, ("M", "G"): -2,
+    ("T", "S"): 1, ("I", "E"): -3, ("P", "M"): -2, ("M", "K"): -1,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 5, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 1, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 3,
+    ("F", "D"): -3, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -3, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum62.cmp
+blosum62 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 5, ("N", "A"): -2, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -3, ("W", "A"): -3, ("Y", "M"): -1,
+    ("G", "R"): -2, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): 0, ("E", "C"): -4, ("Y", "Q"): -1, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -1, ("V", "E"): -2,
+    ("T", "N"): 0, ("P", "P"): 7, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -1, ("I", "H"): -3, ("I", "D"): -3,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -3, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -3, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -4, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -3, ("F", "E"): -3, ("D", "N"): 1, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -3, ("Z", "I"): -3, ("Z", "V"): -2,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -1, ("Q", "C"): -3, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -1, ("S", "G"): 0, ("L", "E"): -3,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): 0, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): 0, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 2, ("Y", "R"): -2,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -3, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -2,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 5, ("K", "D"): -1, ("I", "C"): -1,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -1, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): 0, ("F", "F"): 6, ("X", "W"): -2,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -2, ("X", "S"): 0,
+    ("F", "N"): -3, ("S", "R"): -1, ("W", "D"): -4, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -2, ("H", "N"): 1,
+    ("W", "T"): -2, ("T", "T"): 5, ("S", "F"): -2, ("W", "P"): -4,
+    ("L", "D"): -4, ("B", "I"): -3, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -2, ("G", "D"): -1,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 5,
+    ("Y", "S"): -2, ("C", "N"): -3, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -3, ("T", "L"): -1, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -1, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -2, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -1, ("X", "F"): -1,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -3,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -2, ("Z", "F"): -3,
+    ("S", "Q"): 0, ("W", "C"): -2, ("W", "K"): -3, ("H", "Q"): 0,
+    ("L", "C"): -1, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -1, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -3, ("V", "P"): -2, ("P", "E"): -1, ("M", "C"): -1,
+    ("K", "N"): 0, ("I", "I"): 4, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -3, ("P", "M"): -2, ("M", "K"): -1,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 5, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 1, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 3,
+    ("F", "D"): -3, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -3, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
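+
+
+# A short usage sketch: score an ungapped, equal-length alignment with
+# BLOSUM62 via the pair_score helper defined earlier. Illustrative only;
+# gap penalties and actual alignment are outside the scope of this module:
+def ungapped_score(matrix, seq1, seq2):
+    """Sum pairwise substitution scores over two aligned, equal-length sequences."""
+    assert len(seq1) == len(seq2)
+    return sum(pair_score(matrix, a, b) for a, b in zip(seq1, seq2))
+
+
+# e.g. ungapped_score(blosum62, "HEAGAWGHE", "HEAGAWGHE") scores a peptide
+# against itself.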
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum65.cmp
+blosum65 = {
+    ("W", "F"): 1, ("L", "R"): -2, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -2, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): -1,
+    ("G", "R"): -2, ("Y", "I"): -1, ("Y", "E"): -2, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): -1, ("E", "C"): -4, ("Y", "Q"): -2, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -2, ("I", "H"): -3, ("I", "D"): -3,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -3, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -4, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -4, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -3, ("F", "E"): -3, ("D", "N"): 1, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): -1,
+    ("F", "M"): 0, ("B", "C"): -3, ("Z", "I"): -3, ("Z", "V"): -2,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -3, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -2, ("Q", "C"): -3, ("W", "I"): -2,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): 0, ("L", "E"): -3,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): 0, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): 0, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 2, ("Y", "R"): -2,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): -1, ("I", "C"): -1,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): 0,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): 0, ("F", "F"): 6, ("X", "W"): -2,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -3, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -1,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -2, ("H", "N"): 1,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -2, ("W", "P"): -4,
+    ("L", "D"): -4, ("B", "I"): -3, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -3, ("Z", "S"): 0, ("Y", "C"): -2, ("G", "D"): -1,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 5,
+    ("Y", "S"): -2, ("C", "N"): -3, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -3, ("T", "L"): -1, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -1, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 10, ("M", "H"): -2, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -3, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -3,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -3, ("Z", "F"): -3,
+    ("S", "Q"): 0, ("W", "C"): -2, ("W", "K"): -3, ("H", "Q"): 1,
+    ("L", "C"): -1, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -3, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): -1, ("T", "C"): -1,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -2, ("P", "E"): -1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 4, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -3, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 1, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -1, ("B", "N"): 3,
+    ("F", "D"): -4, ("X", "Y"): -1, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -3, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum70.cmp
+blosum70 = {
+    ("W", "F"): 1, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -2, ("Z", "Y"): -2, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): -1,
+    ("G", "R"): -3, ("Y", "I"): -1, ("Y", "E"): -3, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): -1, ("E", "C"): -4, ("Y", "Q"): -2, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -2, ("I", "H"): -4, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): -2, ("F", "Q"): -3, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -4, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -4, ("X", "D"): -2, ("Z", "K"): 1, ("F", "A"): -2,
+    ("Z", "W"): -3, ("F", "E"): -4, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): -1,
+    ("F", "M"): 0, ("B", "C"): -4, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 4, ("L", "Q"): -2, ("W", "E"): -4, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -2, ("Q", "C"): -3, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -3,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -2,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -3, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -4,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): -1, ("I", "C"): -1,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): -1, ("F", "F"): 6, ("X", "W"): -3,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -3, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -2,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -2, ("H", "N"): 0,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -4,
+    ("L", "D"): -4, ("B", "I"): -4, ("L", "H"): -3, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -4, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -2,
+    ("B", "V"): -3, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 5,
+    ("Y", "S"): -2, ("C", "N"): -3, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -4, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -4, ("K", "I"): -3, ("M", "D"): -3,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -2, ("P", "N"): -2,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -4, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -3, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -3, ("Z", "F"): -4,
+    ("S", "Q"): 0, ("W", "C"): -3, ("W", "K"): -3, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -4, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -3, ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -3,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): -2, ("T", "C"): -1,
+    ("V", "H"): -3, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 4, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -3, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -1, ("B", "N"): 3,
+    ("F", "D"): -4, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -1,
+    ("B", "F"): -4, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum75.cmp
+blosum75 = {
+    ("W", "F"): 1, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -2, ("Z", "Y"): -3, ("W", "R"): -3,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): -2,
+    ("G", "R"): -3, ("Y", "I"): -2, ("Y", "E"): -3, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): -1, ("E", "C"): -5, ("Y", "Q"): -2, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -2, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -2, ("I", "H"): -4, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -3,
+    ("P", "H"): -2, ("F", "Q"): -4, ("Z", "G"): -2, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -4, ("X", "H"): -1, ("D", "R"): -2,
+    ("B", "W"): -5, ("X", "D"): -2, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -3, ("F", "E"): -4, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -1, ("F", "I"): 0, ("B", "G"): -1, ("X", "T"): -1,
+    ("F", "M"): 0, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -4, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -2, ("Q", "C"): -3, ("W", "I"): -3,
+    ("S", "C"): -1, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4,
+    ("W", "Q"): -2, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -3, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -2,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -3, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -5,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): -1, ("I", "C"): -1,
+    ("Z", "D"): 1, ("F", "R"): -3, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -2, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): -1, ("F", "F"): 6, ("X", "W"): -3,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -5, ("V", "Y"): -2,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -2, ("H", "N"): 0,
+    ("W", "T"): -3, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -5,
+    ("L", "D"): -4, ("B", "I"): -4, ("L", "H"): -3, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -2, ("E", "Q"): 2,
+    ("Y", "G"): -4, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -2,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 5,
+    ("Y", "S"): -2, ("C", "N"): -3, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -4, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -4, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -2, ("P", "N"): -3,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -4, ("B", "Q"): 0, ("X", "B"): -2, ("B", "M"): -3,
+    ("F", "C"): -2, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -3, ("Z", "F"): -4,
+    ("S", "Q"): 0, ("W", "C"): -3, ("W", "K"): -4, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 2,
+    ("Y", "D"): -4, ("E", "R"): 0, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -3, ("E", "N"): -1, ("Y", "T"): -2, ("Y", "P"): -4,
+    ("T", "K"): -1, ("A", "A"): 4, ("P", "Q"): -2, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -1, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 4, ("P", "A"): -1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -3, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -1, ("B", "N"): 3,
+    ("F", "D"): -4, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -4, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum80.cmp
+blosum80 = {
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -2, ("Z", "Y"): -3, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -2, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): -2,
+    ("G", "R"): -3, ("Y", "I"): -2, ("Y", "E"): -3, ("B", "Y"): -3,
+    ("Y", "A"): -2, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): -1, ("E", "C"): -5, ("Y", "Q"): -2, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -2, ("V", "M"): 1, ("T", "F"): -2, ("V", "Q"): -3,
+    ("K", "K"): 5, ("P", "D"): -2, ("I", "H"): -4, ("I", "D"): -4,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -4, ("X", "H"): -2, ("D", "R"): -2,
+    ("B", "W"): -5, ("X", "D"): -2, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -4, ("F", "E"): -4, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -1, ("F", "I"): -1, ("B", "G"): -1, ("X", "T"): -1,
+    ("F", "M"): 0, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -4, ("Q", "R"): 1,
+    ("N", "N"): 6, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -3,
+    ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4,
+    ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -3, ("G", "Q"): -2,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 1, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -4, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -4, ("P", "G"): -3, ("M", "A"): -1, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 4, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -5,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): -1, ("I", "C"): -2,
+    ("Z", "D"): 1, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -3, ("Z", "H"): 0,
+    ("B", "L"): -4, ("B", "H"): -1, ("F", "F"): 6, ("X", "W"): -3,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -6, ("V", "Y"): -2,
+    ("W", "L"): -2, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 0,
+    ("W", "T"): -4, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -5,
+    ("L", "D"): -5, ("B", "I"): -4, ("L", "H"): -3, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -3, ("E", "Q"): 2,
+    ("Y", "G"): -4, ("Z", "S"): 0, ("Y", "C"): -3, ("G", "D"): -2,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -3, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -4, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -4, ("I", "N"): -4, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -2, ("P", "N"): -3,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -4, ("B", "Q"): 0, ("X", "B"): -2, ("B", "M"): -3,
+    ("F", "C"): -3, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 1, ("X", "V"): -1, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -1, ("D", "D"): 6, ("W", "G"): -4, ("Z", "F"): -4,
+    ("S", "Q"): 0, ("W", "C"): -3, ("W", "K"): -4, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -4, ("S", "E"): 0, ("H", "E"): 0, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -2, ("Y", "H"): 2,
+    ("Y", "D"): -4, ("E", "R"): -1, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -4, ("E", "N"): -1, ("Y", "T"): -2, ("Y", "P"): -4,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -1,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -2, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -4, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -4, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum85.cmp
+blosum85 = {
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -2, ("Z", "Y"): -3, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 8, ("S", "H"): -1,
+    ("H", "D"): -2, ("L", "N"): -4, ("W", "A"): -3, ("Y", "M"): -2,
+    ("G", "R"): -3, ("Y", "I"): -2, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -3, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 7,
+    ("G", "N"): -1, ("E", "C"): -5, ("Y", "Q"): -2, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -2, ("I", "H"): -4, ("I", "D"): -5,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -2, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -2, ("D", "R"): -2,
+    ("B", "W"): -5, ("X", "D"): -2, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -4, ("F", "E"): -4, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -1, ("X", "T"): -1,
+    ("F", "M"): -1, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -4, ("Q", "R"): 1,
+    ("N", "N"): 7, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -3,
+    ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4,
+    ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -5, ("V", "F"): -1,
+    ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -3, ("T", "E"): -1,
+    ("V", "N"): -4, ("P", "G"): -3, ("M", "A"): -2, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -3, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -5,
+    ("P", "K"): -2, ("M", "M"): 7, ("K", "D"): -1, ("I", "C"): -2,
+    ("Z", "D"): 1, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -2, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): 0,
+    ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -3,
+    ("B", "D"): 4, ("D", "A"): -2, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -6, ("V", "Y"): -2,
+    ("W", "L"): -3, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 0,
+    ("W", "T"): -4, ("T", "T"): 5, ("S", "F"): -3, ("W", "P"): -5,
+    ("L", "D"): -5, ("B", "I"): -5, ("L", "H"): -3, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 4, ("Y", "K"): -3, ("E", "Q"): 2,
+    ("Y", "G"): -5, ("Z", "S"): -1, ("Y", "C"): -3, ("G", "D"): -2,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -2, ("C", "N"): -4, ("V", "C"): -1, ("T", "H"): -2,
+    ("P", "R"): -2, ("V", "G"): -4, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2,
+    ("P", "F"): -4, ("I", "N"): -4, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -3,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 4,
+    ("X", "N"): -2, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -4, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4,
+    ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -4,
+    ("B", "E"): 0, ("X", "V"): -1, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -4, ("Z", "F"): -4,
+    ("S", "Q"): -1, ("W", "C"): -4, ("W", "K"): -5, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -5,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -2, ("Y", "H"): 2,
+    ("Y", "D"): -4, ("E", "R"): -1, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -4, ("E", "N"): -1, ("Y", "T"): -2, ("Y", "P"): -4,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2,
+    ("V", "H"): -4, ("T", "G"): -2, ("I", "Q"): -4, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -3, ("P", "E"): -2, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -1,
+    ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -4, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -4, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum90.cmp
+blosum90 = {
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -2, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -3, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 8, ("S", "H"): -2,
+    ("H", "D"): -2, ("L", "N"): -4, ("W", "A"): -4, ("Y", "M"): -2,
+    ("G", "R"): -3, ("Y", "I"): -2, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -3, ("V", "D"): -5, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): -1, ("E", "C"): -6, ("Y", "Q"): -3, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -3, ("I", "H"): -4, ("I", "D"): -5,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -2, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -2, ("D", "R"): -3,
+    ("B", "W"): -6, ("X", "D"): -2, ("Z", "K"): 1, ("F", "A"): -3,
+    ("Z", "W"): -4, ("F", "E"): -5, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -1,
+    ("F", "M"): -1, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -5, ("Q", "R"): 1,
+    ("N", "N"): 7, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -4,
+    ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4,
+    ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -3,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3,
+    ("M", "Q"): 0, ("T", "I"): -1, ("C", "D"): -5, ("V", "F"): -2,
+    ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -3, ("T", "E"): -1,
+    ("V", "N"): -4, ("P", "G"): -3, ("M", "A"): -2, ("K", "H"): -1,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -3, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -5,
+    ("P", "K"): -2, ("M", "M"): 7, ("K", "D"): -1, ("I", "C"): -2,
+    ("Z", "D"): 0, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -2, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): 0,
+    ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -3,
+    ("B", "D"): 4, ("D", "A"): -3, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -6, ("V", "Y"): -3,
+    ("W", "L"): -3, ("H", "R"): 0, ("W", "H"): -3, ("H", "N"): 0,
+    ("W", "T"): -4, ("T", "T"): 6, ("S", "F"): -3, ("W", "P"): -5,
+    ("L", "D"): -5, ("B", "I"): -5, ("L", "H"): -4, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 5, ("Y", "K"): -3, ("E", "Q"): 2,
+    ("Y", "G"): -5, ("Z", "S"): -1, ("Y", "C"): -4, ("G", "D"): -2,
+    ("B", "V"): -4, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -2,
+    ("P", "R"): -3, ("V", "G"): -5, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2,
+    ("P", "F"): -4, ("I", "N"): -4, ("K", "I"): -4, ("M", "D"): -4,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -3,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 4,
+    ("X", "N"): -2, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -4, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4,
+    ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -5,
+    ("B", "E"): 0, ("X", "V"): -2, ("F", "K"): -4, ("B", "A"): -2,
+    ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -4, ("Z", "F"): -4,
+    ("S", "Q"): -1, ("W", "C"): -4, ("W", "K"): -5, ("H", "Q"): 1,
+    ("L", "C"): -2, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -5,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -3,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -2, ("Y", "H"): 1,
+    ("Y", "D"): -4, ("E", "R"): -1, ("X", "P"): -2, ("G", "G"): 6,
+    ("G", "C"): -4, ("E", "N"): -1, ("Y", "T"): -2, ("Y", "P"): -4,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2,
+    ("V", "H"): -4, ("T", "G"): -3, ("I", "Q"): -4, ("Z", "T"): -1,
+    ("C", "R"): -5, ("V", "P"): -3, ("P", "E"): -2, ("M", "C"): -2,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -2,
+    ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -4, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/blosum95.cmp
+blosum95 = {
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -2, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -4, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 9, ("S", "H"): -2,
+    ("H", "D"): -2, ("L", "N"): -5, ("W", "A"): -4, ("Y", "M"): -3,
+    ("G", "R"): -4, ("Y", "I"): -2, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -3, ("V", "D"): -5, ("B", "S"): -1, ("Y", "Y"): 8,
+    ("G", "N"): -1, ("E", "C"): -6, ("Y", "Q"): -3, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): -1, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -3,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -3, ("I", "H"): -4, ("I", "D"): -5,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -3, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -2, ("D", "R"): -3,
+    ("B", "W"): -6, ("X", "D"): -2, ("Z", "K"): 0, ("F", "A"): -3,
+    ("Z", "W"): -4, ("F", "E"): -5, ("D", "N"): 1, ("B", "K"): -1,
+    ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -1,
+    ("F", "M"): -1, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -5, ("Q", "R"): 0,
+    ("N", "N"): 7, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -4,
+    ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4,
+    ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3,
+    ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -3,
+    ("G", "A"): -1, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3,
+    ("M", "Q"): -1, ("T", "I"): -2, ("C", "D"): -5, ("V", "F"): -2,
+    ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -3, ("T", "E"): -2,
+    ("V", "N"): -4, ("P", "G"): -4, ("M", "A"): -2, ("K", "H"): -1,
+    ("V", "R"): -4, ("P", "C"): -5, ("M", "E"): -3, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -6,
+    ("P", "K"): -2, ("M", "M"): 7, ("K", "D"): -2, ("I", "C"): -2,
+    ("Z", "D"): 0, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1,
+    ("X", "G"): -3, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): 0,
+    ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -4,
+    ("B", "D"): 4, ("D", "A"): -3, ("S", "L"): -3, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -2, ("W", "D"): -6, ("V", "Y"): -3,
+    ("W", "L"): -3, ("H", "R"): -1, ("W", "H"): -3, ("H", "N"): 0,
+    ("W", "T"): -4, ("T", "T"): 6, ("S", "F"): -3, ("W", "P"): -5,
+    ("L", "D"): -5, ("B", "I"): -5, ("L", "H"): -4, ("S", "N"): 0,
+    ("B", "T"): -1, ("L", "L"): 5, ("Y", "K"): -3, ("E", "Q"): 2,
+    ("Y", "G"): -5, ("Z", "S"): -1, ("Y", "C"): -4, ("G", "D"): -2,
+    ("B", "V"): -5, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6,
+    ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -2,
+    ("P", "R"): -3, ("V", "G"): -5, ("T", "L"): -2, ("V", "K"): -3,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2,
+    ("P", "F"): -5, ("I", "N"): -4, ("K", "I"): -4, ("M", "D"): -5,
+    ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -3,
+    ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 4,
+    ("X", "N"): -2, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -5, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4,
+    ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -5,
+    ("B", "E"): 0, ("X", "V"): -2, ("F", "K"): -4, ("B", "A"): -3,
+    ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -5, ("Z", "F"): -4,
+    ("S", "Q"): -1, ("W", "C"): -4, ("W", "K"): -5, ("H", "Q"): 1,
+    ("L", "C"): -3, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -5,
+    ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -3,
+    ("H", "A"): -3, ("S", "M"): -3, ("Y", "L"): -2, ("Y", "H"): 1,
+    ("Y", "D"): -5, ("E", "R"): -1, ("X", "P"): -3, ("G", "G"): 6,
+    ("G", "C"): -5, ("E", "N"): -1, ("Y", "T"): -3, ("Y", "P"): -5,
+    ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2,
+    ("V", "H"): -4, ("T", "G"): -3, ("I", "Q"): -4, ("Z", "T"): -2,
+    ("C", "R"): -5, ("V", "P"): -4, ("P", "E"): -2, ("M", "C"): -3,
+    ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2,
+    ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 7, ("X", "M"): -2,
+    ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -2,
+    ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 4,
+    ("F", "D"): -5, ("X", "Y"): -2, ("Z", "R"): -1, ("F", "H"): -2,
+    ("B", "F"): -5, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/feng.cmp
+feng = {
+    ("W", "F"): 3, ("L", "R"): 2, ("I", "I"): 6, ("Q", "Q"): 6,
+    ("W", "N"): 0, ("V", "I"): 5, ("H", "T"): 2, ("H", "P"): 3,
+    ("W", "V"): 3, ("Q", "E"): 4, ("W", "R"): 2, ("Q", "A"): 3,
+    ("H", "H"): 6, ("H", "D"): 3, ("L", "N"): 1, ("Y", "M"): 2,
+    ("Y", "I"): 3, ("Y", "E"): 1, ("E", "S"): 3, ("Y", "A"): 2,
+    ("Y", "Y"): 6, ("T", "C"): 2, ("E", "C"): 0, ("Y", "Q"): 2,
+    ("E", "G"): 4, ("V", "A"): 5, ("C", "C"): 6, ("M", "R"): 2,
+    ("P", "T"): 4, ("V", "E"): 4, ("P", "P"): 6, ("I", "T"): 3,
+    ("K", "S"): 3, ("R", "G"): 3, ("I", "P"): 2, ("R", "C"): 2,
+    ("A", "T"): 5, ("K", "K"): 6, ("A", "P"): 5, ("V", "M"): 4,
+    ("I", "D"): 1, ("K", "C"): 0, ("K", "G"): 2, ("R", "S"): 3,
+    ("F", "Q"): 1, ("F", "A"): 2, ("V", "V"): 6, ("M", "N"): 1,
+    ("F", "E"): 0, ("D", "N"): 5, ("F", "I"): 4, ("F", "M"): 2,
+    ("M", "S"): 1, ("S", "S"): 6, ("L", "Q"): 2, ("W", "E"): 1,
+    ("W", "A"): 2, ("W", "M"): 3, ("H", "S"): 3, ("W", "I"): 2,
+    ("S", "C"): 4, ("L", "A"): 2, ("L", "E"): 1, ("W", "Q"): 1,
+    ("H", "G"): 1, ("Q", "N"): 3, ("H", "C"): 2, ("L", "M"): 5,
+    ("W", "Y"): 3, ("Y", "N"): 3, ("E", "P"): 3, ("Y", "F"): 5,
+    ("E", "T"): 3, ("A", "A"): 6, ("I", "N"): 2, ("G", "A"): 5,
+    ("Y", "V"): 3, ("E", "D"): 5, ("W", "H"): 1, ("Y", "R"): 1,
+    ("M", "Q"): 2, ("P", "S"): 4, ("R", "H"): 4, ("A", "C"): 2,
+    ("R", "D"): 2, ("K", "P"): 2, ("L", "D"): 1, ("K", "T"): 4,
+    ("V", "N"): 2, ("M", "A"): 2, ("K", "H"): 3, ("V", "R"): 2,
+    ("P", "C"): 2, ("M", "E"): 1, ("A", "S"): 5, ("T", "T"): 6,
+    ("R", "T"): 3, ("I", "G"): 2, ("R", "P"): 3, ("K", "D"): 3,
+    ("I", "C"): 2, ("F", "R"): 1, ("F", "V"): 4, ("L", "C"): 2,
+    ("F", "F"): 6, ("D", "A"): 4, ("F", "N"): 1, ("W", "D"): 0,
+    ("L", "P"): 3, ("Q", "S"): 3, ("N", "C"): 2, ("N", "G"): 3,
+    ("H", "N"): 4, ("W", "T"): 1, ("Q", "G"): 2, ("W", "P"): 2,
+    ("Q", "C"): 1, ("N", "S"): 5, ("L", "H"): 3, ("L", "L"): 6,
+    ("G", "T"): 2, ("M", "M"): 6, ("G", "P"): 3, ("Y", "K"): 1,
+    ("Y", "G"): 2, ("Y", "C"): 3, ("E", "A"): 4, ("E", "E"): 6,
+    ("Y", "S"): 3, ("M", "P"): 2, ("V", "C"): 2, ("M", "T"): 3,
+    ("V", "G"): 4, ("R", "E"): 2, ("V", "K"): 3, ("K", "Q"): 4,
+    ("R", "A"): 2, ("I", "R"): 2, ("N", "A"): 3, ("V", "S"): 2,
+    ("M", "D"): 0, ("M", "H"): 1, ("K", "A"): 3, ("R", "Q"): 3,
+    ("K", "E"): 4, ("F", "S"): 3, ("I", "K"): 2, ("D", "P"): 2,
+    ("D", "T"): 2, ("I", "M"): 4, ("F", "C"): 3, ("W", "L"): 4,
+    ("F", "G"): 1, ("F", "K"): 0, ("F", "T"): 1, ("D", "D"): 6,
+    ("Q", "T"): 3, ("W", "G"): 3, ("Q", "P"): 3, ("W", "C"): 3,
+    ("W", "K"): 1, ("H", "Q"): 4, ("Q", "D"): 4, ("W", "W"): 6,
+    ("V", "L"): 5, ("L", "G"): 2, ("W", "S"): 2, ("L", "K"): 2,
+    ("N", "P"): 2, ("H", "E"): 2, ("N", "T"): 4, ("H", "A"): 2,
+    ("Y", "L"): 3, ("Y", "H"): 3, ("G", "S"): 5, ("Y", "D"): 2,
+    ("V", "Q"): 2, ("L", "T"): 2, ("G", "G"): 6, ("G", "C"): 3,
+    ("E", "N"): 3, ("Y", "T"): 2, ("Y", "P"): 2, ("R", "N"): 2,
+    ("V", "D"): 3, ("K", "R"): 5, ("V", "H"): 1, ("I", "Q"): 1,
+    ("V", "P"): 3, ("M", "C"): 2, ("K", "N"): 4, ("V", "T"): 3,
+    ("M", "G"): 1, ("T", "S"): 5, ("I", "E"): 1, ("M", "K"): 2,
+    ("I", "A"): 2, ("N", "N"): 6, ("R", "R"): 6, ("F", "P"): 2,
+    ("L", "I"): 5, ("I", "S"): 2, ("D", "S"): 3, ("L", "S"): 2,
+    ("I", "H"): 1, ("F", "D"): 1, ("D", "C"): 1, ("F", "H"): 2,
+    ("D", "G"): 4, ("F", "L"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/fitch.cmp
+fitch = {
+    ("W", "F"): 2, ("U", "I"): 1, ("W", "T"): 0, ("R", "R"): 3,
+    ("N", "M"): 1, ("U", "M"): 2, ("U", "O"): 1, ("N", "A"): 2,
+    ("U", "A"): 1, ("N", "C"): 1, ("U", "C"): 1, ("N", "E"): 2,
+    ("U", "E"): 1, ("W", "H"): 1, ("W", "R"): 1, ("W", "V"): 2,
+    ("Q", "E"): 1, ("S", "S"): 3, ("T", "T"): 3, ("S", "F"): 2,
+    ("Q", "A"): 1, ("Q", "C"): 1, ("H", "F"): 1, ("S", "H"): 1,
+    ("L", "H"): 1, ("S", "N"): 2, ("Q", "I"): 2, ("S", "L"): 2,
+    ("L", "L"): 3, ("Y", "M"): 1, ("M", "M"): 3, ("Y", "I"): 1,
+    ("W", "N"): 2, ("Y", "R"): 2, ("Y", "E"): 1, ("O", "O"): 3,
+    ("Y", "A"): 2, ("Y", "C"): 2, ("U", "S"): 2, ("Y", "U"): 2,
+    ("Q", "Q"): 3, ("E", "A"): 1, ("N", "L"): 1, ("E", "C"): 1,
+    ("W", "W"): 3, ("E", "E"): 3, ("Y", "S"): 2, ("V", "A"): 2,
+    ("C", "C"): 3, ("V", "C"): 2, ("T", "H"): 0, ("V", "E"): 1,
+    ("T", "N"): 0, ("O", "I"): 1, ("R", "E"): 2, ("O", "E"): 1,
+    ("R", "A"): 1, ("O", "C"): 2, ("R", "C"): 2, ("O", "A"): 2,
+    ("V", "Q"): 1, ("V", "M"): 2, ("V", "S"): 1, ("Q", "O"): 2,
+    ("I", "H"): 1, ("M", "F"): 1, ("R", "Q"): 2, ("M", "H"): 0,
+    ("O", "F"): 2, ("T", "F"): 1, ("M", "I"): 2, ("M", "L"): 1,
+    ("V", "T"): 1, ("Q", "H"): 2, ("Q", "N"): 1, ("T", "M"): 1,
+    ("W", "M"): 2, ("Y", "Y"): 3, ("T", "S"): 2, ("R", "O"): 1,
+    ("T", "R"): 2, ("H", "H"): 3, ("F", "A"): 1, ("F", "C"): 2,
+    ("F", "E"): 1, ("W", "L"): 1, ("T", "L"): 1, ("U", "R"): 2,
+    ("U", "H"): 2, ("N", "H"): 2, ("U", "Q"): 2, ("W", "E"): 2,
+    ("U", "L"): 1, ("W", "C"): 1, ("U", "N"): 1, ("W", "A"): 1,
+    ("W", "O"): 1, ("I", "F"): 0, ("U", "F"): 2, ("N", "F"): 1,
+    ("L", "C"): 1, ("S", "C"): 2, ("L", "A"): 2, ("S", "A"): 1,
+    ("W", "S"): 2, ("L", "E"): 2, ("S", "E"): 2, ("Q", "L"): 1,
+    ("H", "E"): 1, ("S", "I"): 1, ("H", "C"): 1, ("S", "O"): 2,
+    ("H", "A"): 2, ("S", "M"): 1, ("Y", "L"): 2, ("Y", "N"): 1,
+    ("Y", "H"): 1, ("O", "M"): 0, ("Y", "Q"): 1, ("Y", "F"): 1,
+    ("W", "I"): 1, ("C", "A"): 1, ("R", "I"): 2, ("Y", "O"): 1,
+    ("Q", "M"): 1, ("S", "Q"): 2, ("U", "T"): 2, ("Y", "T"): 2,
+    ("Y", "V"): 2, ("O", "L"): 1, ("R", "N"): 1, ("A", "A"): 3,
+    ("N", "I"): 2, ("R", "L"): 1, ("T", "I"): 1, ("L", "F"): 1,
+    ("T", "O"): 1, ("R", "H"): 2, ("O", "H"): 2, ("V", "F"): 2,
+    ("T", "C"): 2, ("V", "H"): 1, ("T", "A"): 0, ("R", "F"): 1,
+    ("V", "L"): 2, ("T", "E"): 1, ("V", "N"): 1, ("M", "A"): 0,
+    ("Q", "F"): 1, ("M", "C"): 0, ("W", "U"): 2, ("I", "I"): 3,
+    ("V", "V"): 3, ("O", "N"): 2, ("I", "E"): 2, ("T", "Q"): 2,
+    ("I", "A"): 1, ("N", "N"): 3, ("I", "C"): 0, ("S", "R"): 2,
+    ("V", "R"): 1, ("L", "I"): 1, ("V", "I"): 1, ("R", "M"): 2,
+    ("Y", "W"): 1, ("M", "E"): 2, ("V", "U"): 2, ("W", "Q"): 0,
+    ("U", "U"): 3, ("V", "O"): 1, ("F", "F"): 3
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/genetic.cmp
+genetic = {
+    ("W", "F"): 0.0, ("L", "R"): -0.4, ("I", "I"): 4.1, ("Q", "Q"): 5.5,
+    ("W", "N"): -3.0, ("V", "I"): 1.0, ("H", "T"): -1.8, ("H", "P"): 0.7,
+    ("W", "V"): -2.1, ("Q", "E"): 2.0, ("W", "R"): 1.8, ("Q", "A"): -2.1,
+    ("H", "H"): 4.7, ("H", "D"): 1.7, ("L", "N"): -2.2, ("Y", "M"): -2.9,
+    ("Y", "I"): -1.6, ("Y", "E"): -0.9, ("E", "S"): -2.8, ("Y", "A"): -2.4,
+    ("Y", "Y"): 6.5, ("T", "C"): -1.9, ("E", "C"): -3.0, ("Y", "Q"): -0.8,
+    ("E", "G"): 1.4, ("V", "A"): 1.0, ("C", "C"): 5.5, ("M", "R"): -0.4,
+    ("P", "T"): 1.1, ("V", "E"): 1.3, ("P", "P"): 3.8, ("I", "T"): 0.8,
+    ("K", "S"): -1.5, ("R", "G"): 0.8, ("I", "P"): -1.6, ("R", "C"): 0.7,
+    ("A", "T"): 0.9, ("K", "K"): 5.6, ("A", "P"): 0.8, ("V", "M"): 1.0,
+    ("I", "D"): -2.1, ("K", "C"): -3.2, ("K", "G"): -2.2, ("R", "S"): 0.3,
+    ("F", "Q"): -2.1, ("F", "A"): -2.4, ("V", "V"): 4.1, ("M", "N"): 0.1,
+    ("F", "E"): -2.9, ("D", "N"): 1.7, ("F", "I"): 1.3, ("F", "M"): 0.5,
+    ("M", "S"): -1.3, ("S", "S"): 2.6, ("L", "Q"): 0.1, ("W", "E"): -3.2,
+    ("W", "A"): -2.2, ("W", "M"): -2.0, ("H", "S"): -1.6, ("W", "I"): -2.2,
+    ("S", "C"): 1.5, ("L", "A"): -2.3, ("L", "E"): -2.5, ("W", "Q"): -2.3,
+    ("H", "G"): -2.2, ("Q", "N"): 0.4, ("H", "C"): -1.6, ("L", "M"): 1.5,
+    ("W", "Y"): -0.5, ("Y", "N"): 2.5, ("E", "P"): -2.1, ("Y", "F"): 2.0,
+    ("E", "T"): -2.1, ("A", "A"): 4.0, ("I", "N"): 0.9, ("G", "A"): 1.2,
+    ("Y", "V"): -2.2, ("E", "D"): 3.8, ("W", "H"): -2.1, ("Y", "R"): -1.9,
+    ("M", "Q"): -1.2, ("P", "S"): 0.4, ("R", "H"): 3.6, ("A", "C"): -1.9,
+    ("R", "D"): -2.3, ("K", "P"): -1.5, ("L", "D"): -2.4, ("K", "T"): 1.0,
+    ("V", "N"): -2.2, ("M", "A"): -2.0, ("K", "H"): 0.6, ("V", "R"): -2.1,
+    ("P", "C"): -1.9, ("M", "E"): -1.8, ("A", "S"): 0.1, ("T", "T"): 4.0,
+    ("R", "T"): -0.6, ("I", "G"): -2.5, ("R", "P"): 0.3, ("K", "D"): 0.3,
+    ("I", "C"): -1.9, ("F", "R"): -1.5, ("F", "V"): 1.0, ("L", "C"): -1.3,
+    ("F", "F"): 4.5, ("D", "A"): 1.0, ("F", "N"): -1.3, ("W", "D"): -2.9,
+    ("L", "P"): 0.0, ("Q", "S"): -2.3, ("N", "C"): -1.5, ("N", "G"): -2.6,
+    ("H", "N"): 1.8, ("W", "T"): -2.2, ("Q", "G"): -2.1, ("W", "P"): -1.6,
+    ("Q", "C"): -3.1, ("N", "S"): -0.3, ("L", "H"): -0.1, ("L", "L"): 3.4,
+    ("G", "T"): -2.1, ("M", "M"): 5.4, ("G", "P"): -1.8, ("Y", "K"): -0.8,
+    ("Y", "G"): -1.8, ("Y", "C"): 2.6, ("E", "A"): 1.3, ("E", "E"): 5.7,
+    ("Y", "S"): 0.3, ("M", "P"): -1.4, ("V", "C"): -2.2, ("M", "T"): 0.7,
+    ("V", "G"): 1.1, ("R", "E"): -2.0, ("V", "K"): -2.1, ("K", "Q"): 2.2,
+    ("R", "A"): -1.6, ("I", "R"): -1.2, ("N", "A"): -1.7, ("V", "S"): -2.2,
+    ("M", "D"): -2.5, ("M", "H"): -1.8, ("K", "A"): -1.9, ("R", "Q"): 0.3,
+    ("K", "E"): 2.0, ("F", "S"): 0.0, ("I", "K"): 0.7, ("D", "P"): -2.2,
+    ("D", "T"): -2.1, ("I", "M"): 3.3, ("F", "C"): 1.8, ("W", "L"): -0.3,
+    ("F", "G"): -1.9, ("F", "K"): -2.8, ("F", "T"): -2.1, ("D", "D"): 4.8,
+    ("Q", "T"): -1.7, ("W", "G"): 1.4, ("Q", "P"): 1.0, ("W", "C"): 4.1,
+    ("W", "K"): -3.0, ("H", "Q"): 3.6, ("Q", "D"): 0.3, ("W", "W"): 7.5,
+    ("V", "L"): 1.1, ("L", "G"): -2.2, ("W", "S"): 0.8, ("L", "K"): -2.0,
+    ("N", "P"): -1.6, ("H", "E"): 0.3, ("N", "T"): 0.9, ("H", "A"): -2.1,
+    ("Y", "L"): -1.6, ("Y", "H"): 2.3, ("G", "S"): -0.6, ("Y", "D"): 2.3,
+    ("V", "Q"): -2.0, ("L", "T"): -1.9, ("G", "G"): 4.2, ("G", "C"): 1.0,
+    ("E", "N"): 0.3, ("Y", "T"): -2.1, ("Y", "P"): -2.3, ("R", "N"): -1.5,
+    ("V", "D"): 1.0, ("K", "R"): -0.2, ("V", "H"): -2.1, ("I", "Q"): -1.9,
+    ("V", "P"): -2.1, ("M", "C"): -2.7, ("K", "N"): 3.5, ("V", "T"): -2.2,
+    ("M", "G"): -2.3, ("T", "S"): 1.0, ("I", "E"): -2.3, ("M", "K"): 1.6,
+    ("I", "A"): -1.8, ("N", "N"): 4.7, ("R", "R"): 2.9, ("F", "P"): -1.8,
+    ("L", "I"): 1.2, ("I", "S"): -0.5, ("D", "S"): -2.1, ("L", "S"): -1.2,
+    ("I", "H"): -1.8, ("F", "D"): -1.7, ("D", "C"): -1.6, ("F", "H"): -1.1,
+    ("D", "G"): 1.1, ("F", "L"): 2.2
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/gonnet.cmp
+gonnet = {
+    ("W", "F"): 3.6, ("L", "R"): -2.2, ("I", "I"): 4.0, ("Q", "Q"): 2.7,
+    ("W", "N"): -3.6, ("V", "I"): 3.1, ("H", "T"): -0.3, ("H", "P"): -1.1,
+    ("W", "V"): -2.6, ("Q", "E"): 1.7, ("W", "R"): -1.6, ("Q", "A"): -0.2,
+    ("H", "H"): 6.0, ("H", "D"): 0.4, ("L", "N"): -3.0, ("Y", "M"): -0.2,
+    ("Y", "I"): -0.7, ("Y", "E"): -2.7, ("E", "S"): 0.2, ("Y", "A"): -2.2,
+    ("Y", "Y"): 7.8, ("T", "C"): -0.5, ("E", "C"): -3.0, ("Y", "Q"): -1.7,
+    ("E", "G"): -0.8, ("V", "A"): 0.1, ("C", "C"): 11.5, ("M", "R"): -1.7,
+    ("P", "T"): 0.1, ("V", "E"): -1.9, ("P", "P"): 7.6, ("I", "T"): -0.6,
+    ("K", "S"): 0.1, ("R", "G"): -1.0, ("I", "P"): -2.6, ("R", "C"): -2.2,
+    ("A", "T"): 0.6, ("K", "K"): 3.2, ("A", "P"): 0.3, ("V", "M"): 1.6,
+    ("I", "D"): -3.8, ("K", "C"): -2.8, ("K", "G"): -1.1, ("R", "S"): -0.2,
+    ("F", "Q"): -2.6, ("F", "A"): -2.3, ("V", "V"): 3.4, ("M", "N"): -2.2,
+    ("F", "E"): -3.9, ("D", "N"): 2.2, ("F", "I"): 1.0, ("F", "M"): 1.6,
+    ("M", "S"): -1.4, ("S", "S"): 2.2, ("L", "Q"): -1.6, ("W", "E"): -4.3,
+    ("W", "A"): -3.6, ("W", "M"): -1.0, ("H", "S"): -0.2, ("W", "I"): -1.8,
+    ("S", "C"): 0.1, ("L", "A"): -1.2, ("L", "E"): -2.8, ("W", "Q"): -2.7,
+    ("H", "G"): -1.4, ("Q", "N"): 0.7, ("H", "C"): -1.3, ("L", "M"): 2.8,
+    ("W", "Y"): 4.1, ("Y", "N"): -1.4, ("E", "P"): -0.5, ("Y", "F"): 5.1,
+    ("E", "T"): -0.1, ("A", "A"): 2.4, ("I", "N"): -2.8, ("G", "A"): 0.5,
+    ("Y", "V"): -1.1, ("E", "D"): 2.7, ("W", "H"): -0.8, ("Y", "R"): -1.8,
+    ("M", "Q"): -1.0, ("P", "S"): 0.4, ("R", "H"): 0.6, ("A", "C"): 0.5,
+    ("R", "D"): -0.3, ("K", "P"): -0.6, ("L", "D"): -4.0, ("K", "T"): 0.1,
+    ("V", "N"): -2.2, ("M", "A"): -0.7, ("K", "H"): 0.6, ("V", "R"): -2.0,
+    ("P", "C"): -3.1, ("M", "E"): -2.0, ("A", "S"): 1.1, ("T", "T"): 2.5,
+    ("R", "T"): -0.2, ("I", "G"): -4.5, ("R", "P"): -0.9, ("K", "D"): 0.5,
+    ("I", "C"): -1.1, ("F", "R"): -3.2, ("F", "V"): 0.1, ("L", "C"): -1.5,
+    ("F", "F"): 7.0, ("D", "A"): -0.3, ("F", "N"): -3.1, ("W", "D"): -5.2,
+    ("L", "P"): -2.3, ("Q", "S"): 0.2, ("N", "C"): -1.8, ("N", "G"): 0.4,
+    ("H", "N"): 1.2, ("W", "T"): -3.5, ("Q", "G"): -1.0, ("W", "P"): -5.0,
+    ("Q", "C"): -2.4, ("N", "S"): 0.9, ("L", "H"): -1.9, ("L", "L"): 4.0,
+    ("G", "T"): -1.1, ("M", "M"): 4.3, ("G", "P"): -1.6, ("Y", "K"): -2.1,
+    ("Y", "G"): -4.0, ("Y", "C"): -0.5, ("E", "A"): 0.0, ("E", "E"): 3.6,
+    ("Y", "S"): -1.9, ("M", "P"): -2.4, ("V", "C"): 0.0, ("M", "T"): -0.6,
+    ("V", "G"): -3.3, ("R", "E"): 0.4, ("V", "K"): -1.7, ("K", "Q"): 1.5,
+    ("R", "A"): -0.6, ("I", "R"): -2.4, ("N", "A"): -0.3, ("V", "S"): -1.0,
+    ("M", "D"): -3.0, ("M", "H"): -1.3, ("K", "A"): -0.4, ("R", "Q"): 1.5,
+    ("K", "E"): 1.2, ("F", "S"): -2.8, ("I", "K"): -2.1, ("D", "P"): -0.7,
+    ("D", "T"): 0.0, ("I", "M"): 2.5, ("F", "C"): -0.8, ("W", "L"): -0.7,
+    ("F", "G"): -5.2, ("F", "K"): -3.3, ("F", "T"): -2.2, ("D", "D"): 4.7,
+    ("Q", "T"): 0.0, ("W", "G"): -4.0, ("Q", "P"): -0.2, ("W", "C"): -1.0,
+    ("W", "K"): -3.5, ("H", "Q"): 1.2, ("Q", "D"): 0.9, ("W", "W"): 14.2,
+    ("V", "L"): 1.8, ("L", "G"): -4.4, ("W", "S"): -3.3, ("L", "K"): -2.1,
+    ("N", "P"): -0.9, ("H", "E"): 0.4, ("N", "T"): 0.5, ("H", "A"): -0.8,
+    ("Y", "L"): 0.0, ("Y", "H"): 2.2, ("G", "S"): 0.4, ("Y", "D"): -2.8,
+    ("V", "Q"): -1.5, ("L", "T"): -1.3, ("G", "G"): 6.6, ("G", "C"): -2.0,
+    ("E", "N"): 0.9, ("Y", "T"): -1.9, ("Y", "P"): -3.1, ("R", "N"): 0.3,
+    ("V", "D"): -2.9, ("K", "R"): 2.7, ("V", "H"): -2.0, ("I", "Q"): -1.9,
+    ("V", "P"): -1.8, ("M", "C"): -0.9, ("K", "N"): 0.8, ("V", "T"): 0.0,
+    ("M", "G"): -3.5, ("T", "S"): 1.5, ("I", "E"): -2.7, ("M", "K"): -1.4,
+    ("I", "A"): -0.8, ("N", "N"): 3.8, ("R", "R"): 4.7, ("F", "P"): -3.8,
+    ("L", "I"): 2.8, ("I", "S"): -1.8, ("D", "S"): 0.5, ("L", "S"): -2.1,
+    ("I", "H"): -2.2, ("F", "D"): -4.5, ("D", "C"): -3.2, ("F", "H"): -0.1,
+    ("D", "G"): 0.1, ("F", "L"): 2.0
+}
+
+
+# From https://www.genome.jp/dbget-bin/www_bget?aaindex:GRAR740104,
+# with each Grantham distance subtracted from 215 (similarity = 215 - distance),
+# as per the original reference, now available at
+# https://web.archive.org/web/19991111011852/http://www.embl-heidelberg.de:80/~vogt/matrices/grant.cmp,
+# which was found to have incorrect amino acid characters (issue 308).
+# A conversion sketch follows the matrix below.
+grant = {
+    ("A", "A"): 215, ("C", "A"): 20, ("C", "C"): 215, ("C", "D"): 61,
+    ("C", "N"): 76, ("C", "R"): 35, ("D", "A"): 89, ("D", "D"): 215,
+    ("D", "N"): 192, ("D", "R"): 119, ("E", "A"): 108, ("E", "C"): 45,
+    ("E", "D"): 170, ("E", "E"): 215, ("E", "N"): 173, ("E", "Q"): 186,
+    ("E", "R"): 161, ("F", "A"): 102, ("F", "C"): 10, ("F", "D"): 38,
+    ("F", "E"): 75, ("F", "F"): 215, ("F", "G"): 62, ("F", "H"): 115,
+    ("F", "I"): 194, ("F", "K"): 113, ("F", "L"): 193, ("F", "M"): 187,
+    ("F", "N"): 57, ("F", "Q"): 99, ("F", "R"): 118, ("G", "A"): 155,
+    ("G", "C"): 56, ("G", "D"): 121, ("G", "E"): 117, ("G", "G"): 215,
+    ("G", "N"): 135, ("G", "Q"): 128, ("G", "R"): 90, ("H", "A"): 129,
+    ("H", "C"): 41, ("H", "D"): 134, ("H", "E"): 175, ("H", "G"): 117,
+    ("H", "H"): 215, ("H", "N"): 147, ("H", "Q"): 191, ("H", "R"): 186,
+    ("I", "A"): 121, ("I", "C"): 17, ("I", "D"): 47, ("I", "E"): 81,
+    ("I", "G"): 80, ("I", "H"): 121, ("I", "I"): 215, ("I", "N"): 66,
+    ("I", "Q"): 106, ("I", "R"): 118, ("K", "A"): 109, ("K", "C"): 13,
+    ("K", "D"): 114, ("K", "E"): 159, ("K", "G"): 88, ("K", "H"): 183,
+    ("K", "I"): 113, ("K", "K"): 215, ("K", "L"): 108, ("K", "N"): 121,
+    ("K", "Q"): 162, ("K", "R"): 189, ("L", "A"): 119, ("L", "C"): 17,
+    ("L", "D"): 43, ("L", "E"): 77, ("L", "G"): 77, ("L", "H"): 116,
+    ("L", "I"): 210, ("L", "L"): 215, ("L", "N"): 62, ("L", "Q"): 102,
+    ("L", "R"): 113, ("M", "A"): 131, ("M", "C"): 19, ("M", "D"): 55,
+    ("M", "E"): 89, ("M", "G"): 88, ("M", "H"): 128, ("M", "I"): 205,
+    ("M", "K"): 120, ("M", "L"): 200, ("M", "M"): 215, ("M", "N"): 73,
+    ("M", "Q"): 114, ("M", "R"): 124, ("N", "A"): 104, ("N", "N"): 215,
+    ("N", "R"): 129, ("P", "A"): 188, ("P", "C"): 46, ("P", "D"): 107,
+    ("P", "E"): 122, ("P", "F"): 101, ("P", "G"): 173, ("P", "H"): 138,
+    ("P", "I"): 120, ("P", "K"): 112, ("P", "L"): 117, ("P", "M"): 128,
+    ("P", "N"): 124, ("P", "P"): 215, ("P", "Q"): 139, ("P", "R"): 112,
+    ("Q", "A"): 124, ("Q", "C"): 61, ("Q", "D"): 154, ("Q", "N"): 169,
+    ("Q", "Q"): 215, ("Q", "R"): 172, ("R", "A"): 103, ("R", "R"): 215,
+    ("S", "A"): 116, ("S", "C"): 103, ("S", "D"): 150, ("S", "E"): 135,
+    ("S", "F"): 60, ("S", "G"): 159, ("S", "H"): 126, ("S", "I"): 73,
+    ("S", "K"): 94, ("S", "L"): 70, ("S", "M"): 80, ("S", "N"): 169,
+    ("S", "P"): 141, ("S", "Q"): 147, ("S", "R"): 105, ("S", "S"): 215,
+    ("T", "A"): 157, ("T", "C"): 66, ("T", "D"): 130, ("T", "E"): 150,
+    ("T", "F"): 112, ("T", "G"): 156, ("T", "H"): 168, ("T", "I"): 126,
+    ("T", "K"): 137, ("T", "L"): 123, ("T", "M"): 134, ("T", "N"): 150,
+    ("T", "P"): 177, ("T", "Q"): 173, ("T", "R"): 144, ("T", "S"): 157,
+    ("T", "T"): 215, ("V", "A"): 151, ("V", "C"): 23, ("V", "D"): 63,
+    ("V", "E"): 94, ("V", "F"): 165, ("V", "G"): 106, ("V", "H"): 131,
+    ("V", "I"): 186, ("V", "K"): 118, ("V", "L"): 183, ("V", "M"): 194,
+    ("V", "N"): 82, ("V", "P"): 147, ("V", "Q"): 119, ("V", "R"): 119,
+    ("V", "S"): 91, ("V", "T"): 146, ("V", "V"): 215, ("V", "W"): 127,
+    ("V", "Y"): 160, ("W", "A"): 67, ("W", "C"): 0, ("W", "D"): 34,
+    ("W", "E"): 63, ("W", "F"): 175, ("W", "G"): 31, ("W", "H"): 100,
+    ("W", "I"): 154, ("W", "K"): 105, ("W", "L"): 154, ("W", "M"): 148,
+    ("W", "N"): 41, ("W", "P"): 68, ("W", "Q"): 85, ("W", "R"): 114,
+    ("W", "S"): 38, ("W", "T"): 87, ("W", "W"): 215, ("Y", "A"): 103,
+    ("Y", "C"): 21, ("Y", "D"): 55, ("Y", "E"): 93, ("Y", "F"): 193,
+    ("Y", "G"): 68, ("Y", "H"): 132, ("Y", "I"): 182, ("Y", "K"): 130,
+    ("Y", "L"): 179, ("Y", "M"): 179, ("Y", "N"): 72, ("Y", "P"): 105,
+    ("Y", "Q"): 116, ("Y", "R"): 138, ("Y", "S"): 71, ("Y", "T"): 123,
+    ("Y", "W"): 178, ("Y", "Y"): 215
+}
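+
+
+# Illustrative inverse of the conversion described above (ours, not part
+# of the original data): distance = 215 - similarity.
+def grantham_distance(a, b):
+    """Recover the original Grantham distance from the `grant` matrix."""
+    return 215 - (grant[(a, b)] if (a, b) in grant else grant[(b, a)])
+# e.g. grantham_distance("A", "A") == 0 and grantham_distance("W", "C") == 215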
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/ident.cmp
+ident = {
+    ("W", "F"): -1, ("L", "R"): -1, ("I", "I"): 6, ("Q", "Q"): 6,
+    ("W", "N"): -1, ("V", "I"): -1, ("H", "T"): -1, ("H", "P"): -1,
+    ("W", "V"): -1, ("Q", "E"): -1, ("W", "R"): -1, ("Q", "A"): -1,
+    ("H", "H"): 6, ("H", "D"): -1, ("L", "N"): -1, ("Y", "M"): -1,
+    ("Y", "I"): -1, ("Y", "E"): -1, ("E", "S"): -1, ("Y", "A"): -1,
+    ("Y", "Y"): 6, ("T", "C"): -1, ("E", "C"): -1, ("Y", "Q"): -1,
+    ("E", "G"): -1, ("V", "A"): -1, ("C", "C"): 6, ("M", "R"): -1,
+    ("P", "T"): -1, ("V", "E"): -1, ("P", "P"): 6, ("I", "T"): -1,
+    ("K", "S"): -1, ("R", "G"): -1, ("I", "P"): -1, ("R", "C"): -1,
+    ("A", "T"): -1, ("K", "K"): 6, ("A", "P"): -1, ("V", "M"): -1,
+    ("I", "D"): -1, ("K", "C"): -1, ("K", "G"): -1, ("R", "S"): -1,
+    ("F", "Q"): -1, ("F", "A"): -1, ("V", "V"): 6, ("M", "N"): -1,
+    ("F", "E"): -1, ("D", "N"): -1, ("F", "I"): -1, ("F", "M"): -1,
+    ("M", "S"): -1, ("S", "S"): 6, ("L", "Q"): -1, ("W", "E"): -1,
+    ("W", "A"): -1, ("W", "M"): -1, ("H", "S"): -1, ("W", "I"): -1,
+    ("S", "C"): -1, ("L", "A"): -1, ("L", "E"): -1, ("W", "Q"): -1,
+    ("H", "G"): -1, ("Q", "N"): -1, ("H", "C"): -1, ("L", "M"): -1,
+    ("W", "Y"): -1, ("Y", "N"): -1, ("E", "P"): -1, ("Y", "F"): -1,
+    ("E", "T"): -1, ("A", "A"): 6, ("I", "N"): -1, ("G", "A"): -1,
+    ("Y", "V"): -1, ("E", "D"): -1, ("W", "H"): -1, ("Y", "R"): -1,
+    ("M", "Q"): -1, ("P", "S"): -1, ("R", "H"): -1, ("A", "C"): -1,
+    ("R", "D"): -1, ("K", "P"): -1, ("L", "D"): -1, ("K", "T"): -1,
+    ("V", "N"): -1, ("M", "A"): -1, ("K", "H"): -1, ("V", "R"): -1,
+    ("P", "C"): -1, ("M", "E"): -1, ("A", "S"): -1, ("T", "T"): 6,
+    ("R", "T"): -1, ("I", "G"): -1, ("R", "P"): -1, ("K", "D"): -1,
+    ("I", "C"): -1, ("F", "R"): -1, ("F", "V"): -1, ("L", "C"): -1,
+    ("F", "F"): 6, ("D", "A"): -1, ("F", "N"): -1, ("W", "D"): -1,
+    ("L", "P"): -1, ("Q", "S"): -1, ("N", "C"): -1, ("N", "G"): -1,
+    ("H", "N"): -1, ("W", "T"): -1, ("Q", "G"): -1, ("W", "P"): -1,
+    ("Q", "C"): -1, ("N", "S"): -1, ("L", "H"): -1, ("L", "L"): 6,
+    ("G", "T"): -1, ("M", "M"): 6, ("G", "P"): -1, ("Y", "K"): -1,
+    ("Y", "G"): -1, ("Y", "C"): -1, ("E", "A"): -1, ("E", "E"): 6,
+    ("Y", "S"): -1, ("M", "P"): -1, ("V", "C"): -1, ("M", "T"): -1,
+    ("V", "G"): -1, ("R", "E"): -1, ("V", "K"): -1, ("K", "Q"): -1,
+    ("R", "A"): -1, ("I", "R"): -1, ("N", "A"): -1, ("V", "S"): -1,
+    ("M", "D"): -1, ("M", "H"): -1, ("K", "A"): -1, ("R", "Q"): -1,
+    ("K", "E"): -1, ("F", "S"): -1, ("I", "K"): -1, ("D", "P"): -1,
+    ("D", "T"): -1, ("I", "M"): -1, ("F", "C"): -1, ("W", "L"): -1,
+    ("F", "G"): -1, ("F", "K"): -1, ("F", "T"): -1, ("D", "D"): 6,
+    ("Q", "T"): -1, ("W", "G"): -1, ("Q", "P"): -1, ("W", "C"): -1,
+    ("W", "K"): -1, ("H", "Q"): -1, ("Q", "D"): -1, ("W", "W"): 6,
+    ("V", "L"): -1, ("L", "G"): -1, ("W", "S"): -1, ("L", "K"): -1,
+    ("N", "P"): -1, ("H", "E"): -1, ("N", "T"): -1, ("H", "A"): -1,
+    ("Y", "L"): -1, ("Y", "H"): -1, ("G", "S"): -1, ("Y", "D"): -1,
+    ("V", "Q"): -1, ("L", "T"): -1, ("G", "G"): 6, ("G", "C"): -1,
+    ("E", "N"): -1, ("Y", "T"): -1, ("Y", "P"): -1, ("R", "N"): -1,
+    ("V", "D"): -1, ("K", "R"): -1, ("V", "H"): -1, ("I", "Q"): -1,
+    ("V", "P"): -1, ("M", "C"): -1, ("K", "N"): -1, ("V", "T"): -1,
+    ("M", "G"): -1, ("T", "S"): -1, ("I", "E"): -1, ("M", "K"): -1,
+    ("I", "A"): -1, ("N", "N"): 6, ("R", "R"): 6, ("F", "P"): -1,
+    ("L", "I"): -1, ("I", "S"): -1, ("D", "S"): -1, ("L", "S"): -1,
+    ("I", "H"): -1, ("F", "D"): -1, ("D", "C"): -1, ("F", "H"): -1,
+    ("D", "G"): -1, ("F", "L"): -1
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/johnson.cmp
+johnson = {
+    ("W", "F"): 3.4, ("S", "P"): -1.0, ("N", "M"): -3.7, ("Q", "Q"): 9.0,
+    ("N", "A"): -1.4, ("N", "E"): -0.7, ("W", "V"): -4.9, ("Q", "E"): 2.4,
+    ("L", "H"): -4.2, ("W", "R"): -3.8, ("Q", "A"): -0.6, ("S", "D"): -0.2,
+    ("H", "H"): 12.7, ("Q", "M"): -0.6, ("S", "H"): -2.6, ("H", "D"): -0.7,
+    ("Q", "I"): -7.0, ("S", "L"): -5.2, ("Y", "M"): -1.3, ("Y", "I"): -2.5,
+    ("Y", "E"): -3.7, ("Y", "A"): -4.0, ("G", "F"): -8.6, ("V", "T"): -1.9,
+    ("Y", "Y"): 10.5, ("V", "H"): -3.9, ("E", "C"): -6.9, ("Y", "Q"): -5.1,
+    ("V", "A"): -0.5, ("C", "C"): 16.1, ("V", "E"): -4.2, ("T", "N"): 0.1,
+    ("R", "K"): 3.2, ("P", "P"): 10.3, ("V", "I"): 3.9, ("R", "G"): -2.8,
+    ("V", "M"): 0.7, ("T", "F"): -5.0, ("R", "C"): -5.6, ("V", "Q"): -3.6,
+    ("K", "K"): 7.6, ("P", "D"): -1.0, ("I", "H"): -5.1, ("M", "F"): -0.6,
+    ("I", "D"): -4.8, ("K", "C"): -8.7, ("P", "L"): -2.8, ("K", "G"): -3.5,
+    ("P", "H"): -4.3, ("T", "R"): -1.4, ("F", "A"): -3.2, ("F", "E"): -6.4,
+    ("S", "S"): 5.8, ("W", "E"): -7.6, ("N", "N"): 8.0, ("W", "M"): -0.9,
+    ("Q", "C"): -6.9, ("N", "F"): -3.8, ("S", "C"): -7.7, ("L", "A"): -3.3,
+    ("S", "G"): -1.3, ("L", "E"): -5.6, ("W", "Q"): -8.2, ("H", "G"): -3.2,
+    ("S", "K"): -1.5, ("Q", "N"): -0.8, ("V", "D"): -5.2, ("H", "C"): -8.2,
+    ("Y", "N"): -1.3, ("Y", "F"): 3.4, ("W", "I"): -3.3, ("C", "A"): -3.4,
+    ("G", "E"): -2.5, ("G", "A"): -0.5, ("Y", "V"): -1.8, ("E", "D"): 2.4,
+    ("W", "H"): -4.0, ("Y", "R"): -2.1, ("N", "I"): -4.7, ("R", "L"): -3.7,
+    ("T", "I"): -3.2, ("Q", "L"): -4.4, ("R", "H"): 0.1, ("T", "M"): -3.2,
+    ("V", "F"): -1.3, ("R", "D"): -3.4, ("T", "A"): -0.8, ("T", "P"): -2.0,
+    ("T", "E"): -0.5, ("V", "N"): -5.7, ("P", "G"): -2.5, ("M", "A"): -1.5,
+    ("K", "H"): 0.1, ("V", "R"): -4.9, ("P", "C"): -8.9, ("M", "E"): -2.8,
+    ("V", "V"): 7.0, ("T", "T"): 6.8, ("M", "I"): 2.6, ("T", "Q"): -0.4,
+    ("I", "G"): -5.5, ("P", "K"): -0.6, ("M", "M"): 11.2, ("K", "D"): -1.5,
+    ("I", "C"): -7.7, ("L", "C"): -8.7, ("F", "F"): 10.4, ("D", "A"): -1.6,
+    ("S", "R"): -0.6, ("W", "D"): -6.0, ("N", "C"): -7.6, ("N", "G"): -1.4,
+    ("W", "T"): -9.3, ("Q", "G"): -2.8, ("S", "F"): -4.8, ("W", "P"): -7.4,
+    ("L", "D"): -8.0, ("H", "F"): -1.7, ("Q", "K"): 1.1, ("S", "N"): 1.0,
+    ("L", "L"): 7.3, ("Q", "F"): -6.4, ("Y", "K"): -3.7, ("Y", "G"): -5.4,
+    ("Y", "C"): -7.7, ("G", "D"): -2.1, ("E", "A"): -0.7, ("Y", "W"): 2.3,
+    ("E", "E"): 8.6, ("Y", "S"): -3.4, ("R", "M"): -4.2, ("V", "C"): -4.8,
+    ("T", "H"): -3.0, ("R", "I"): -5.4, ("V", "G"): -5.6, ("T", "L"): -4.6,
+    ("R", "E"): -0.2, ("V", "K"): -3.7, ("R", "Q"): 2.1, ("R", "A"): -1.6,
+    ("T", "D"): -1.8, ("P", "F"): -5.0, ("V", "S"): -4.3, ("K", "I"): -4.7,
+    ("M", "D"): -5.9, ("W", "W"): 15.2, ("M", "H"): -2.3, ("P", "N"): -2.4,
+    ("I", "F"): 0.5, ("K", "A"): -0.9, ("M", "L"): 4.4, ("K", "E"): 1.1,
+    ("N", "K"): 0.1, ("R", "P"): -3.6, ("L", "F"): 1.8, ("F", "C"): -4.4,
+    ("W", "G"): -6.3, ("W", "L"): -1.0, ("D", "D"): 8.5, ("N", "H"): 1.7,
+    ("S", "Q"): -1.2, ("Q", "P"): -3.6, ("N", "L"): -4.8, ("W", "K"): -5.4,
+    ("Q", "D"): -1.1, ("W", "N"): -6.1, ("S", "A"): 0.0, ("L", "G"): -7.2,
+    ("W", "S"): -6.2, ("S", "E"): -2.2, ("L", "K"): -3.4, ("H", "E"): -2.3,
+    ("S", "I"): -4.7, ("Q", "H"): 1.4, ("H", "A"): -3.1, ("S", "M"): -4.8,
+    ("Y", "L"): -2.4, ("Y", "H"): -0.4, ("Y", "D"): -3.8, ("G", "G"): 8.0,
+    ("G", "C"): -8.2, ("Y", "T"): -2.7, ("W", "C"): -9.1, ("Y", "P"): -7.0,
+    ("T", "K"): -0.2, ("R", "N"): -1.5, ("A", "A"): 6.0, ("W", "A"): -5.8,
+    ("T", "C"): -6.0, ("N", "D"): 2.6, ("R", "F"): -6.0, ("T", "G"): -3.8,
+    ("V", "L"): 1.8, ("V", "P"): -5.2, ("P", "E"): -1.5, ("M", "C"): -4.4,
+    ("I", "I"): 8.1, ("P", "A"): -1.0, ("M", "G"): -5.2, ("T", "S"): 2.0,
+    ("I", "E"): -4.8, ("P", "M"): -9.8, ("M", "K"): -1.9, ("K", "F"): -5.6,
+    ("I", "A"): -2.2, ("P", "I"): -5.7, ("R", "R"): 10.0, ("L", "I"): 2.6,
+    ("F", "D"): -7.0, ("D", "C"): -9.7
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/levin.cmp
+levin = {
+    ("W", "F"): 0, ("L", "R"): -1, ("S", "P"): 0, ("I", "I"): 2,
+    ("Q", "Q"): 2, ("N", "A"): 0, ("H", "T"): 0, ("N", "E"): 0,
+    ("H", "P"): 0, ("W", "V"): 0, ("Q", "E"): 1, ("W", "R"): 0,
+    ("Q", "A"): 0, ("S", "D"): 0, ("H", "H"): 2, ("H", "D"): 0,
+    ("L", "N"): -1, ("W", "A"): -1, ("Y", "M"): 0, ("Y", "I"): 0,
+    ("Y", "E"): -1, ("Y", "A"): -1, ("Y", "Y"): 2, ("Y", "Q"): -1,
+    ("E", "G"): 0, ("A", "D"): 0, ("C", "C"): 2, ("M", "R"): -1,
+    ("V", "E"): -1, ("T", "N"): 0, ("R", "K"): 1, ("P", "P"): 3,
+    ("I", "T"): 0, ("K", "S"): 0, ("R", "G"): 0, ("I", "P"): -1,
+    ("C", "G"): 0, ("C", "S"): 0, ("A", "P"): -1, ("I", "D"): -1,
+    ("M", "I"): 0, ("K", "G"): 0, ("M", "N"): -1, ("F", "Q"): -1,
+    ("I", "V"): 1, ("F", "A"): -1, ("V", "V"): 2, ("F", "E"): -1,
+    ("C", "M"): 0, ("F", "I"): 1, ("F", "M"): 0, ("S", "S"): 2,
+    ("L", "Q"): -1, ("W", "E"): -1, ("N", "N"): 3, ("V", "A"): 0,
+    ("C", "K"): 0, ("W", "M"): 0, ("H", "S"): 0, ("L", "V"): 1,
+    ("L", "A"): 0, ("H", "K"): 0, ("S", "G"): 0, ("L", "E"): -1,
+    ("W", "Q"): -1, ("H", "G"): 0, ("Q", "N"): 1, ("T", "A"): 0,
+    ("L", "M"): 2, ("W", "Y"): 0, ("Y", "N"): -1, ("E", "P"): -1,
+    ("Y", "F"): 1, ("W", "I"): 0, ("R", "S"): 0, ("Y", "V"): 0,
+    ("E", "D"): 1, ("W", "H"): -1, ("Y", "R"): -1, ("M", "Q"): -1,
+    ("A", "G"): 0, ("C", "D"): 0, ("R", "D"): 0, ("C", "H"): 0,
+    ("T", "P"): 0, ("K", "T"): 0, ("V", "N"): -1, ("P", "G"): 0,
+    ("M", "A"): 0, ("C", "P"): 0, ("V", "R"): -1, ("M", "V"): 0,
+    ("M", "E"): -1, ("C", "T"): 0, ("I", "K"): -1, ("R", "T"): 0,
+    ("T", "Q"): 0, ("I", "G"): -1, ("R", "P"): 0, ("K", "D"): 0,
+    ("F", "R"): -1, ("F", "V"): 0, ("K", "P"): 0, ("L", "C"): 0,
+    ("F", "F"): 2, ("F", "N"): -1, ("V", "D"): -1, ("L", "P"): -1,
+    ("W", "K"): -1, ("L", "T"): 0, ("I", "N"): -1, ("I", "S"): -1,
+    ("H", "R"): 0, ("N", "G"): 0, ("C", "I"): 0, ("H", "N"): 0,
+    ("W", "T"): -1, ("Q", "G"): 0, ("W", "P"): -1, ("L", "D"): -1,
+    ("L", "H"): -1, ("S", "N"): 0, ("L", "L"): 2, ("M", "M"): 2,
+    ("Y", "K"): -1, ("Y", "G"): -1, ("Y", "C"): -1, ("E", "E"): 2,
+    ("Y", "S"): -1, ("M", "P"): -1, ("C", "A"): 0, ("M", "T"): 0,
+    ("V", "G"): -1, ("C", "E"): 0, ("R", "E"): 0, ("V", "K"): -1,
+    ("K", "Q"): 0, ("R", "A"): 0, ("I", "R"): -1, ("T", "D"): 0,
+    ("V", "S"): -1, ("C", "Q"): 0, ("M", "D"): -1, ("W", "W"): 2,
+    ("M", "H"): -1, ("T", "G"): 0, ("K", "A"): 0, ("R", "Q"): 0,
+    ("T", "T"): 2, ("F", "S"): -1, ("D", "P"): 0, ("F", "C"): -1,
+    ("W", "L"): 0, ("F", "G"): -1, ("F", "K"): -1, ("D", "D"): 2,
+    ("L", "S"): -1, ("W", "G"): -1, ("S", "Q"): 0, ("Q", "P"): 0,
+    ("W", "C"): -1, ("N", "D"): 1, ("H", "Q"): 0, ("Q", "D"): 0,
+    ("W", "N"): -1, ("S", "A"): 1, ("L", "G"): -1, ("W", "S"): -1,
+    ("S", "E"): 0, ("L", "K"): -1, ("N", "P"): 0, ("H", "E"): 0,
+    ("H", "A"): 0, ("Y", "L"): 0, ("Y", "H"): 0, ("Y", "D"): -1,
+    ("V", "Q"): -1, ("G", "G"): 2, ("Y", "T"): -1, ("R", "N"): 0,
+    ("Y", "P"): -1, ("A", "E"): 1, ("C", "V"): 0, ("M", "S"): -1,
+    ("A", "A"): 2, ("V", "H"): -1, ("T", "E"): 0, ("C", "N"): 0,
+    ("I", "Q"): -1, ("C", "R"): 0, ("V", "P"): -1, ("K", "E"): 0,
+    ("K", "N"): 1, ("V", "T"): 0, ("M", "G"): -1, ("T", "S"): 0,
+    ("I", "E"): -1, ("M", "K"): -1, ("I", "A"): 0, ("R", "R"): 2,
+    ("F", "P"): -1, ("L", "I"): 0, ("W", "D"): -1, ("F", "T"): -1,
+    ("K", "K"): 2, ("I", "H"): -1, ("F", "D"): -1, ("F", "H"): -1,
+    ("D", "G"): 0, ("F", "L"): 0
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/mclach.cmp
+mclach = {
+    ("N", "I"): 1, ("K", "V"): 2, ("S", "P"): 3, ("N", "M"): 2,
+    ("L", "V"): 5, ("N", "A"): 3, ("H", "T"): 4, ("N", "E"): 4,
+    ("Q", "Y"): 1, ("H", "P"): 3, ("W", "V"): 2, ("H", "L"): 2,
+    ("Q", "A"): 3, ("H", "H"): 8, ("N", "Q"): 4, ("Q", "M"): 3,
+    ("Q", "I"): 0, ("S", "L"): 2, ("G", "V"): 2, ("Y", "M"): 2,
+    ("K", "L"): 2, ("Y", "I"): 3, ("E", "S"): 4, ("K", "A"): 3,
+    ("E", "W"): 1, ("G", "F"): 0, ("E", "K"): 4, ("Y", "Y"): 9,
+    ("K", "F"): 0, ("E", "G"): 3, ("C", "C"): 9, ("C", "G"): 1,
+    ("M", "V"): 4, ("P", "P"): 8, ("A", "L"): 2, ("K", "S"): 3,
+    ("R", "G"): 3, ("K", "W"): 1, ("R", "C"): 1, ("I", "L"): 5,
+    ("C", "S"): 2, ("C", "W"): 2, ("K", "C"): 0, ("R", "W"): 3,
+    ("P", "L"): 1, ("K", "G"): 3, ("R", "S"): 4, ("D", "R"): 1,
+    ("D", "V"): 1, ("D", "N"): 5, ("F", "I"): 3, ("F", "M"): 5,
+    ("D", "F"): 1, ("S", "S"): 8, ("Q", "V"): 2, ("S", "W"): 3,
+    ("Q", "R"): 5, ("N", "N"): 8, ("H", "W"): 3, ("W", "M"): 1,
+    ("H", "S"): 3, ("W", "I"): 3, ("T", "F"): 1, ("Q", "F"): 0,
+    ("S", "G"): 3, ("H", "G"): 2, ("C", "P"): 0, ("N", "R"): 3,
+    ("H", "C"): 3, ("N", "V"): 1, ("E", "P"): 4, ("Y", "F"): 6,
+    ("E", "T"): 4, ("G", "Y"): 0, ("E", "H"): 2, ("E", "L"): 1,
+    ("Y", "V"): 3, ("G", "M"): 1, ("G", "I"): 1, ("P", "W"): 0,
+    ("R", "L"): 2, ("T", "I"): 3, ("A", "G"): 3, ("R", "H"): 5,
+    ("T", "M"): 3, ("T", "A"): 3, ("K", "T"): 3, ("P", "G"): 3,
+    ("T", "Y"): 1, ("A", "W"): 1, ("C", "T"): 2, ("V", "V"): 8,
+    ("R", "T"): 3, ("R", "P"): 3, ("D", "Y"): 1, ("F", "V"): 3,
+    ("D", "Q"): 4, ("K", "P"): 3, ("D", "I"): 1, ("D", "M"): 2,
+    ("F", "F"): 9, ("D", "A"): 3, ("D", "E"): 5, ("N", "K"): 4,
+    ("Q", "W"): 2, ("S", "V"): 2, ("Q", "S"): 4, ("H", "V"): 2,
+    ("W", "L"): 3, ("N", "G"): 3, ("Q", "G"): 2, ("S", "F"): 2,
+    ("Q", "C"): 0, ("H", "F"): 4, ("N", "S"): 5, ("Q", "K"): 4,
+    ("N", "W"): 0, ("L", "L"): 8, ("E", "Y"): 2, ("M", "M"): 8,
+    ("E", "Q"): 5, ("E", "I"): 1, ("E", "M"): 1, ("E", "A"): 4,
+    ("G", "L"): 1, ("Y", "W"): 6, ("E", "E"): 8, ("R", "M"): 1,
+    ("P", "V"): 2, ("A", "F"): 1, ("C", "A"): 1, ("R", "I"): 1,
+    ("T", "L"): 3, ("I", "V"): 5, ("C", "I"): 1, ("R", "A"): 2,
+    ("C", "Y"): 1, ("C", "M"): 3, ("P", "F"): 1, ("A", "V"): 3,
+    ("K", "I"): 1, ("R", "Y"): 2, ("K", "M"): 1, ("K", "H"): 4,
+    ("T", "P"): 3, ("M", "L"): 6, ("T", "T"): 8, ("C", "L"): 0,
+    ("D", "P"): 3, ("N", "F"): 0, ("K", "Y"): 1, ("D", "T"): 3,
+    ("D", "H"): 4, ("D", "L"): 1, ("K", "K"): 8, ("D", "D"): 8,
+    ("Q", "T"): 3, ("N", "H"): 4, ("Q", "P"): 3, ("N", "L"): 1,
+    ("H", "Y"): 4, ("S", "Y"): 3, ("W", "W"): 9, ("H", "M"): 3,
+    ("S", "A"): 4, ("H", "I"): 2, ("Q", "L"): 3, ("N", "P"): 1,
+    ("S", "I"): 2, ("Q", "H"): 4, ("N", "T"): 3, ("H", "A"): 3,
+    ("S", "M"): 2, ("Y", "L"): 3, ("G", "W"): 1, ("E", "R"): 3,
+    ("E", "V"): 2, ("G", "G"): 8, ("T", "V"): 3, ("E", "F"): 0,
+    ("C", "F"): 0, ("A", "A"): 8, ("K", "R"): 5, ("A", "M"): 3,
+    ("Q", "Q"): 8, ("R", "F"): 1, ("T", "G"): 2, ("A", "I"): 2,
+    ("P", "Y"): 0, ("C", "V"): 1, ("I", "I"): 8, ("P", "A"): 4,
+    ("T", "S"): 5, ("P", "M"): 1, ("R", "V"): 2, ("T", "W"): 2,
+    ("A", "Y"): 1, ("P", "I"): 1, ("R", "R"): 8, ("N", "Y"): 2,
+    ("D", "S"): 3, ("D", "W"): 0, ("M", "I"): 5, ("D", "K"): 3,
+    ("N", "C"): 1, ("E", "C"): 0, ("D", "C"): 1, ("D", "G"): 3,
+    ("F", "L"): 5, ("W", "F"): 6
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/miyata.cmp
+# This similarity matrix was converted from the original Miyata
+# physicochemical distance matrix (PMID: 439147) via the formula:
+# similarity = 1.25 - distance.
+# The offset 1.25 was chosen by Dr. Gerhard Vogt; treat it with caution,
+# as there is no known solid reason to use this particular value.
+# A conversion sketch follows the matrix below.
+miyata = {
+    ("W", "F"): 0.14, ("L", "R"): -1.37, ("S", "P"): 0.69, ("I", "I"): 1.25,
+    ("Q", "Q"): 1.25, ("N", "A"): -0.53, ("H", "T"): -0.07, ("N", "E"): 0.4,
+    ("H", "P"): -0.9, ("W", "V"): -1.26, ("W", "R"): -1.47, ("Q", "A"): -0.67,
+    ("H", "H"): 1.25, ("N", "Q"): 0.26, ("H", "D"): -0.47, ("L", "N"): -2.24,
+    ("Y", "M"): 0.32, ("Y", "I"): 0.39, ("Y", "E"): -1.97, ("E", "S"): -0.81,
+    ("Y", "A"): -1.93, ("Y", "Y"): 1.25, ("E", "C"): -2.01, ("Y", "Q"): -1.23,
+    ("E", "G"): -1.53, ("V", "A"): -0.6, ("C", "C"): 1.25, ("M", "R"): -1.04,
+    ("V", "E"): -1.72, ("R", "K"): 0.85, ("P", "P"): 1.25, ("I", "T"): -0.89,
+    ("K", "S"): -1.46, ("R", "G"): -2.33, ("I", "P"): -1.37, ("R", "C"): -1.81,
+    ("V", "Q"): -0.88, ("K", "K"): 1.25, ("A", "P"): 1.19, ("I", "D"): -2.73,
+    ("K", "C"): -2.02, ("M", "I"): 0.96, ("K", "G"): -2.29, ("R", "S"): -1.49,
+    ("F", "Q"): -1.56, ("I", "V"): 0.4, ("M", "V"): 0.63, ("F", "A"): -1.98,
+    ("V", "V"): 1.25, ("M", "N"): -1.83, ("F", "E"): -2.34, ("D", "N"): 0.6,
+    ("F", "I"): 0.64, ("F", "M"): 0.43, ("M", "S"): -1.42, ("S", "S"): 1.25,
+    ("L", "Q"): -1.45, ("W", "E"): -2.83, ("W", "A"): -2.98, ("W", "M"): -0.64,
+    ("H", "S"): -0.69, ("L", "V"): 0.34, ("S", "C"): -0.59, ("L", "A"): -1.51,
+    ("S", "G"): 0.4, ("L", "E"): -2.28, ("W", "Q"): -2.17, ("H", "G"): -1.53,
+    ("H", "C"): -1.31, ("W", "Y"): 0.19, ("Y", "N"): -2.17, ("E", "P"): -1.23,
+    ("I", "L"): 1.11, ("E", "T"): -0.58, ("W", "I"): -0.47, ("A", "A"): 1.25,
+    ("I", "N"): -2.12, ("G", "A"): 0.34, ("Y", "V"): -0.27, ("W", "H"): -1.91,
+    ("Y", "R"): -0.77, ("M", "Q"): -1.05, ("R", "H"): 0.43, ("A", "C"): -0.14,
+    ("R", "D"): -1.09, ("T", "A"): 0.35, ("T", "P"): 0.38, ("L", "D"): -2.85,
+    ("K", "T"): -0.85, ("V", "N"): -1.51, ("M", "A"): -1.17, ("K", "H"): 0.46,
+    ("V", "R"): -1.18, ("P", "C"): -0.08, ("M", "E"): -1.88, ("I", "K"): -1.59,
+    ("T", "T"): 1.25, ("R", "T"): -0.78, ("I", "G"): -2.35, ("R", "P"): -1.65,
+    ("K", "D"): -0.8, ("I", "C"): -0.38, ("F", "R"): -1.22, ("F", "V"): -0.18,
+    ("D", "Q"): -0.22, ("K", "P"): -1.69, ("F", "F"): 1.25, ("D", "A"): -1.12,
+    ("D", "E"): 0.35, ("F", "N"): -2.45, ("W", "D"): -3.63, ("L", "P"): -1.45,
+    ("Q", "S"): -0.4, ("N", "C"): -1.58, ("N", "G"): -0.71, ("H", "N"): -0.04,
+    ("W", "T"): -2.25, ("Q", "G"): -1.23, ("W", "P"): -2.92, ("Q", "C"): -1.23,
+    ("N", "S"): -0.06, ("L", "H"): -1.34, ("L", "L"): 1.25, ("M", "M"): 1.25,
+    ("G", "P"): 0.28, ("Y", "K"): -1.17, ("E", "Q"): 0.41, ("Y", "G"): -2.83,
+    ("Y", "C"): -1.13, ("E", "A"): -1.21, ("E", "E"): 1.25, ("Y", "S"): -2.08,
+    ("M", "P"): -1.11, ("V", "C"): 0.39, ("M", "T"): -0.61, ("V", "G"): -1.51,
+    ("R", "E"): -0.2, ("V", "K"): -1.45, ("K", "Q"): 0.19, ("R", "A"): -1.67,
+    ("I", "R"): -1.24, ("V", "S"): -0.9, ("M", "L"): 0.84, ("M", "D"): -2.44,
+    ("W", "W"): 1.25, ("M", "H"): -0.94, ("K", "A"): -1.71, ("R", "Q"): 0.12,
+    ("K", "E"): 0.11, ("F", "S"): -2.2, ("D", "P"): -1.15, ("D", "T"): -0.8,
+    ("F", "C"): -0.99, ("W", "L"): -0.48, ("F", "G"): -2.89, ("F", "K"): -1.6,
+    ("F", "T"): -1.35, ("D", "D"): 1.25, ("Q", "T"): 0.13, ("W", "G"): -3.88,
+    ("Q", "P"): -0.67, ("W", "C"): -2.09, ("W", "K"): -1.86, ("H", "Q"): 0.93,
+    ("L", "C"): -0.4, ("W", "N"): -3.14, ("S", "A"): 0.74, ("L", "G"): -2.42,
+    ("W", "S"): -3.13, ("L", "K"): -1.73, ("N", "P"): -0.55, ("H", "E"): 0.29,
+    ("N", "T"): -0.15, ("H", "A"): -0.92, ("Y", "L"): 0.31, ("Y", "H"): -1.02,
+    ("Y", "D"): -2.7, ("L", "T"): -1.0, ("G", "G"): 1.25, ("G", "C"): -0.97,
+    ("Y", "T"): -1.2, ("Y", "P"): -1.87, ("R", "N"): -0.79, ("V", "D"): -2.15,
+    ("T", "C"): -0.2, ("V", "H"): -0.86, ("T", "G"): -0.45, ("I", "Q"): -1.32,
+    ("V", "P"): -0.54, ("M", "C"): -0.21, ("K", "N"): -0.59, ("V", "T"): -0.17,
+    ("M", "G"): -2.09, ("T", "S"): 0.36, ("I", "E"): -2.14, ("M", "K"): -1.38,
+    ("I", "A"): -1.44, ("N", "N"): 1.25, ("R", "R"): 1.25, ("F", "P"): -1.92,
+    ("I", "S"): -1.7, ("D", "S"): -0.62, ("Y", "F"): 0.77, ("L", "S"): -1.79,
+    ("I", "H"): -1.2, ("F", "D"): -3.02, ("D", "C"): -2.23, ("F", "H"): -1.38,
+    ("D", "G"): -1.12, ("F", "L"): 0.62
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/nwsgappep.cmp
+nwsgappep = {
+    ("W", "F"): 1.3, ("S", "P"): 0.4, ("W", "B"): -0.7, ("N", "N"): 1.5,
+    ("N", "A"): 0.2, ("N", "E"): 0.5, ("Z", "Y"): -0.6, ("W", "V"): -0.8,
+    ("L", "B"): -0.5, ("W", "R"): 1.4, ("Q", "A"): 0.2, ("S", "D"): 0.2,
+    ("H", "H"): 1.5, ("Q", "M"): 0.0, ("S", "H"): -0.2, ("H", "D"): 0.4,
+    ("Q", "I"): -0.3, ("S", "L"): -0.4, ("Y", "M"): -0.1, ("Q", "E"): 0.7,
+    ("Y", "I"): 0.1, ("Y", "E"): -0.5, ("Y", "A"): -0.3, ("G", "F"): -0.6,
+    ("V", "T"): 0.2, ("G", "B"): 0.6, ("Y", "Y"): 1.5, ("N", "L"): -0.4,
+    ("E", "C"): -0.6, ("Y", "Q"): -0.6, ("Z", "Z"): 1.1, ("V", "A"): 0.2,
+    ("C", "C"): 1.5, ("V", "E"): -0.2, ("T", "N"): 0.2, ("R", "K"): 0.8,
+    ("P", "P"): 1.5, ("V", "I"): 1.1, ("T", "B"): 0.2, ("R", "G"): -0.3,
+    ("V", "M"): 0.6, ("T", "F"): -0.3, ("R", "C"): -0.3, ("V", "Q"): -0.2,
+    ("K", "K"): 1.5, ("M", "B"): -0.3, ("P", "D"): 0.1, ("I", "H"): -0.3,
+    ("M", "F"): 0.5, ("I", "D"): -0.2, ("K", "C"): -0.6, ("L", "L"): 1.5,
+    ("K", "G"): -0.1, ("P", "H"): 0.2, ("Z", "G"): 0.3, ("W", "M"): -0.3,
+    ("Z", "C"): -0.6, ("T", "R"): -0.1, ("Z", "K"): 0.4, ("F", "A"): -0.5,
+    ("Z", "W"): -0.8, ("F", "E"): -0.7, ("Z", "S"): 0.0, ("D", "B"): 1.1,
+    ("S", "S"): 1.5, ("W", "E"): -1.1, ("W", "A"): -0.8, ("N", "B"): 1.1,
+    ("Q", "C"): -0.6, ("Z", "Q"): 1.1, ("N", "F"): -0.5, ("S", "C"): 0.7,
+    ("Q", "F"): -0.8, ("S", "G"): 0.6, ("Q", "B"): 0.5, ("W", "Q"): -0.5,
+    ("H", "G"): -0.2, ("S", "K"): 0.2, ("L", "I"): 0.8, ("V", "D"): -0.2,
+    ("H", "C"): -0.1, ("L", "E"): -0.3, ("Y", "N"): -0.1, ("Y", "F"): 1.4,
+    ("W", "I"): -0.5, ("C", "A"): 0.3, ("G", "E"): 0.5, ("G", "A"): 0.7,
+    ("Y", "V"): -0.1, ("E", "D"): 1.0, ("W", "H"): -0.1, ("Y", "R"): -0.6,
+    ("N", "I"): -0.3, ("R", "L"): -0.4, ("T", "I"): 0.2, ("V", "B"): -0.2,
+    ("R", "H"): 0.5, ("T", "M"): 0.0, ("V", "F"): 0.2, ("R", "D"): 0.0,
+    ("T", "A"): 0.4, ("T", "P"): 0.3, ("T", "E"): 0.2, ("V", "N"): -0.3,
+    ("P", "G"): 0.3, ("M", "A"): 0.0, ("K", "H"): 0.1, ("V", "R"): -0.3,
+    ("P", "C"): 0.1, ("M", "E"): -0.2, ("V", "V"): 1.5, ("T", "T"): 1.5,
+    ("M", "I"): 0.6, ("T", "Q"): -0.1, ("I", "G"): -0.3, ("P", "K"): 0.1,
+    ("M", "M"): 1.5, ("K", "D"): 0.3, ("I", "C"): 0.2, ("Z", "D"): 0.9,
+    ("Y", "W"): 1.1, ("Z", "L"): -0.2, ("P", "L"): -0.3, ("Z", "I"): -0.2,
+    ("Z", "T"): 0.1, ("L", "C"): -0.8, ("F", "B"): -0.7, ("Z", "P"): 0.2,
+    ("F", "F"): 1.5, ("D", "A"): 0.3, ("S", "R"): 0.1, ("W", "D"): -1.1,
+    ("R", "R"): 1.5, ("W", "K"): 0.1, ("N", "M"): -0.3, ("N", "C"): -0.3,
+    ("N", "G"): 0.4, ("S", "B"): 0.3, ("W", "T"): -0.6, ("Q", "G"): 0.2,
+    ("S", "F"): -0.3, ("W", "P"): -0.8, ("L", "D"): -0.5, ("H", "F"): -0.1,
+    ("L", "H"): -0.2, ("S", "N"): 0.3, ("H", "B"): 0.4, ("Q", "K"): 0.4,
+    ("R", "P"): 0.3, ("Y", "K"): -0.6, ("Y", "B"): -0.3, ("Y", "G"): -0.7,
+    ("Y", "C"): 1.0, ("G", "D"): 0.7, ("E", "A"): 0.3, ("T", "S"): 0.3,
+    ("E", "E"): 1.5, ("Y", "S"): -0.4, ("R", "M"): 0.2, ("V", "C"): 0.2,
+    ("T", "H"): -0.1, ("R", "I"): -0.3, ("V", "S"): -0.1, ("V", "G"): 0.2,
+    ("T", "L"): -0.1, ("R", "E"): 0.0, ("V", "K"): -0.2, ("R", "Q"): 0.4,
+    ("R", "A"): -0.3, ("Z", "H"): 0.5, ("T", "D"): 0.2, ("P", "F"): -0.7,
+    ("L", "A"): -0.1, ("K", "I"): -0.2, ("M", "D"): -0.4, ("P", "B"): 0.1,
+    ("W", "W"): 1.5, ("M", "H"): -0.3, ("P", "N"): 0.0, ("I", "F"): 0.7,
+    ("K", "A"): 0.0, ("M", "L"): 1.3, ("I", "B"): -0.2, ("K", "E"): 0.3,
+    ("Z", "E"): 1.1, ("Q", "N"): 0.4, ("Z", "A"): 0.2, ("Z", "M"): -0.1,
+    ("L", "F"): 1.2, ("F", "C"): -0.1, ("W", "G"): -1.0, ("W", "L"): 0.5,
+    ("C", "B"): -0.4, ("B", "A"): 0.2, ("D", "D"): 1.5, ("N", "H"): 0.5,
+    ("S", "Q"): -0.1, ("Q", "P"): 0.3, ("W", "C"): -1.2, ("N", "D"): 0.7,
+    ("Q", "D"): 0.7, ("W", "N"): -0.3, ("S", "A"): 0.4, ("L", "G"): -0.5,
+    ("W", "S"): 0.3, ("S", "E"): 0.2, ("L", "K"): -0.3, ("H", "E"): 0.4,
+    ("S", "I"): -0.1, ("Q", "H"): 0.7, ("H", "A"): -0.1, ("S", "M"): -0.3,
+    ("Y", "L"): 0.3, ("Y", "H"): 0.3, ("Y", "D"): -0.5, ("G", "G"): 1.5,
+    ("G", "C"): 0.2, ("Y", "T"): -0.3, ("E", "B"): 0.7, ("Y", "P"): -0.8,
+    ("T", "K"): 0.2, ("R", "N"): 0.1, ("A", "A"): 1.5, ("N", "K"): 0.4,
+    ("T", "C"): 0.2, ("V", "H"): -0.3, ("Q", "Q"): 1.5, ("R", "F"): -0.5,
+    ("T", "G"): 0.4, ("V", "L"): 0.8, ("R", "B"): 0.1, ("V", "P"): 0.1,
+    ("P", "E"): 0.1, ("M", "C"): -0.6, ("I", "I"): 1.5, ("P", "A"): 0.5,
+    ("M", "G"): -0.3, ("K", "B"): 0.4, ("I", "E"): -0.2, ("P", "M"): -0.2,
+    ("M", "K"): 0.2, ("K", "F"): -0.7, ("I", "A"): 0.0, ("P", "I"): -0.2,
+    ("Q", "L"): -0.1, ("Z", "F"): -0.7, ("Z", "B"): 0.6, ("Z", "N"): 0.4,
+    ("Z", "V"): -0.2, ("F", "D"): -1.0, ("Z", "R"): 0.2, ("D", "C"): -0.5,
+    ("B", "B"): 1.1
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam120.cmp
+pam120 = {
+    ("W", "F"): -1, ("L", "R"): -4, ("S", "P"): 1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): -1, ("Z", "Y"): -5, ("W", "R"): 1,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 7, ("S", "H"): -2,
+    ("H", "D"): 0, ("L", "N"): -4, ("W", "A"): -7, ("Y", "M"): -4,
+    ("G", "R"): -4, ("Y", "I"): -2, ("Y", "E"): -5, ("B", "Y"): -3,
+    ("Y", "A"): -4, ("V", "D"): -3, ("B", "S"): 0, ("Y", "Y"): 8,
+    ("G", "N"): 0, ("E", "C"): -7, ("Y", "Q"): -5, ("Z", "Z"): 4,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -1, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 6, ("V", "I"): 3, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 1, ("T", "F"): -4, ("V", "Q"): -3,
+    ("K", "K"): 5, ("P", "D"): -3, ("I", "H"): -4, ("I", "D"): -3,
+    ("T", "R"): -2, ("P", "L"): -3, ("K", "G"): -3, ("M", "N"): -3,
+    ("P", "H"): -1, ("F", "Q"): -6, ("Z", "G"): -2, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -7, ("X", "H"): -2, ("D", "R"): -3,
+    ("B", "W"): -6, ("X", "D"): -2, ("Z", "K"): -1, ("F", "A"): -4,
+    ("Z", "W"): -7, ("F", "E"): -7, ("D", "N"): 2, ("B", "K"): 0,
+    ("X", "X"): -2, ("F", "I"): 0, ("B", "G"): 0, ("X", "T"): -1,
+    ("F", "M"): -1, ("B", "C"): -6, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 3, ("L", "Q"): -2, ("W", "E"): -8, ("Q", "R"): 1,
+    ("N", "N"): 4, ("W", "M"): -6, ("Q", "C"): -7, ("W", "I"): -6,
+    ("S", "C"): 0, ("L", "A"): -3, ("S", "G"): 1, ("L", "E"): -4,
+    ("W", "Q"): -6, ("H", "G"): -4, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -3,
+    ("Y", "F"): 4, ("C", "A"): -3, ("V", "L"): 1, ("G", "E"): -1,
+    ("G", "A"): 1, ("K", "R"): 2, ("E", "D"): 3, ("Y", "R"): -5,
+    ("M", "Q"): -1, ("T", "I"): 0, ("C", "D"): -7, ("V", "F"): -3,
+    ("T", "A"): 1, ("T", "P"): -1, ("B", "P"): -2, ("T", "E"): -2,
+    ("V", "N"): -3, ("P", "G"): -2, ("M", "A"): -2, ("K", "H"): -2,
+    ("V", "R"): -3, ("P", "C"): -4, ("M", "E"): -3, ("K", "L"): -4,
+    ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -2, ("I", "G"): -4,
+    ("P", "K"): -2, ("M", "M"): 8, ("K", "D"): -1, ("I", "C"): -3,
+    ("Z", "D"): 3, ("F", "R"): -5, ("X", "K"): -2, ("Q", "D"): 1,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -4, ("Z", "H"): 1,
+    ("B", "L"): -4, ("B", "H"): 1, ("F", "F"): 8, ("X", "W"): -5,
+    ("B", "D"): 4, ("D", "A"): 0, ("S", "L"): -4, ("X", "S"): -1,
+    ("F", "N"): -4, ("S", "R"): -1, ("W", "D"): -8, ("V", "Y"): -3,
+    ("W", "L"): -3, ("H", "R"): 1, ("W", "H"): -3, ("H", "N"): 2,
+    ("W", "T"): -6, ("T", "T"): 4, ("S", "F"): -3, ("W", "P"): -7,
+    ("L", "D"): -5, ("B", "I"): -3, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 5, ("Y", "K"): -5, ("E", "Q"): 2,
+    ("Y", "G"): -6, ("Z", "S"): -1, ("Y", "C"): -1, ("G", "D"): 0,
+    ("B", "V"): -3, ("E", "A"): 0, ("Y", "W"): -2, ("E", "E"): 5,
+    ("Y", "S"): -3, ("C", "N"): -5, ("V", "C"): -3, ("T", "H"): -3,
+    ("P", "R"): -1, ("V", "G"): -2, ("T", "L"): -3, ("V", "K"): -4,
+    ("K", "Q"): 0, ("R", "A"): -3, ("I", "R"): -2, ("T", "D"): -1,
+    ("P", "F"): -5, ("I", "N"): -2, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -8, ("W", "W"): 12, ("M", "H"): -4, ("P", "N"): -2,
+    ("K", "A"): -2, ("M", "L"): 3, ("K", "E"): -1, ("Z", "E"): 4,
+    ("X", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -3,
+    ("K", "C"): -7, ("B", "Q"): 0, ("X", "B"): -1, ("B", "M"): -4,
+    ("F", "C"): -6, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -5,
+    ("B", "E"): 3, ("X", "V"): -1, ("F", "K"): -7, ("B", "A"): 0,
+    ("X", "R"): -2, ("D", "D"): 5, ("W", "G"): -8, ("Z", "F"): -6,
+    ("S", "Q"): -2, ("W", "C"): -8, ("W", "K"): -5, ("H", "Q"): 3,
+    ("L", "C"): -7, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -5,
+    ("W", "S"): -2, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -2,
+    ("H", "A"): -3, ("S", "M"): -2, ("Y", "L"): -2, ("Y", "H"): -1,
+    ("Y", "D"): -5, ("E", "R"): -3, ("X", "P"): -2, ("G", "G"): 5,
+    ("G", "C"): -4, ("E", "N"): 1, ("Y", "T"): -3, ("Y", "P"): -6,
+    ("T", "K"): -1, ("A", "A"): 3, ("P", "Q"): 0, ("T", "C"): -3,
+    ("V", "H"): -3, ("T", "G"): -1, ("I", "Q"): -3, ("Z", "T"): -2,
+    ("C", "R"): -4, ("V", "P"): -2, ("P", "E"): -2, ("M", "C"): -6,
+    ("K", "N"): 1, ("I", "I"): 6, ("P", "A"): 1, ("M", "G"): -4,
+    ("T", "S"): 2, ("I", "E"): -3, ("P", "M"): -3, ("M", "K"): 0,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 6, ("X", "M"): -2,
+    ("L", "I"): 1, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 0, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 3,
+    ("F", "D"): -7, ("X", "Y"): -3, ("Z", "R"): -1, ("F", "H"): -3,
+    ("B", "F"): -5, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam180.cmp
+pam180 = {
+    ("W", "F"): 0, ("L", "R"): -4, ("S", "P"): 1, ("V", "T"): 0,
+    ("Q", "Q"): 6, ("N", "A"): 0, ("Z", "Y"): -6, ("W", "R"): 2,
+    ("Q", "A"): -1, ("S", "D"): 0, ("H", "H"): 8, ("S", "H"): -2,
+    ("H", "D"): 0, ("L", "N"): -4, ("W", "A"): -8, ("Y", "M"): -4,
+    ("G", "R"): -4, ("Y", "I"): -2, ("Y", "E"): -6, ("B", "Y"): -4,
+    ("Y", "A"): -5, ("V", "D"): -3, ("B", "S"): 1, ("Y", "Y"): 11,
+    ("G", "N"): 0, ("E", "C"): -7, ("Y", "Q"): -6, ("Z", "Z"): 5,
+    ("V", "A"): 0, ("C", "C"): 13, ("M", "R"): -1, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 8, ("V", "I"): 5, ("V", "S"): -2,
+    ("Z", "P"): -1, ("V", "M"): 2, ("T", "F"): -4, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -2, ("I", "H"): -4, ("I", "D"): -3,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -3, ("M", "N"): -3,
+    ("P", "H"): -1, ("F", "Q"): -6, ("Z", "G"): -1, ("X", "L"): -2,
+    ("T", "M"): -1, ("Z", "C"): -7, ("X", "H"): -1, ("D", "R"): -3,
+    ("B", "W"): -7, ("X", "D"): -1, ("Z", "K"): 0, ("F", "A"): -5,
+    ("Z", "W"): -8, ("F", "E"): -7, ("D", "N"): 3, ("B", "K"): 0,
+    ("X", "X"): -1, ("F", "I"): 1, ("B", "G"): 0, ("X", "T"): -1,
+    ("F", "M"): 0, ("B", "C"): -6, ("Z", "I"): -3, ("Z", "V"): -3,
+    ("S", "S"): 3, ("L", "Q"): -2, ("W", "E"): -9, ("Q", "R"): 1,
+    ("N", "N"): 4, ("W", "M"): -6, ("Q", "C"): -7, ("W", "I"): -7,
+    ("S", "C"): 0, ("L", "A"): -3, ("S", "G"): 1, ("L", "E"): -5,
+    ("W", "Q"): -6, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0,
+    ("N", "R"): -1, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -2,
+    ("Y", "F"): 7, ("C", "A"): -3, ("V", "L"): 2, ("G", "E"): 0,
+    ("G", "A"): 1, ("K", "R"): 4, ("E", "D"): 4, ("Y", "R"): -6,
+    ("M", "Q"): -1, ("T", "I"): 0, ("C", "D"): -7, ("V", "F"): -2,
+    ("T", "A"): 2, ("T", "P"): 0, ("B", "P"): -2, ("T", "E"): -1,
+    ("V", "N"): -3, ("P", "G"): -1, ("M", "A"): -2, ("K", "H"): -1,
+    ("V", "R"): -4, ("P", "C"): -4, ("M", "E"): -3, ("K", "L"): -4,
+    ("V", "V"): 6, ("M", "I"): 2, ("T", "Q"): -2, ("I", "G"): -4,
+    ("P", "K"): -2, ("M", "M"): 9, ("K", "D"): 0, ("I", "C"): -3,
+    ("Z", "D"): 3, ("F", "R"): -6, ("X", "K"): -1, ("Q", "D"): 2,
+    ("X", "G"): -2, ("Z", "L"): -3, ("X", "C"): -4, ("Z", "H"): 2,
+    ("B", "L"): -5, ("B", "H"): 1, ("F", "F"): 10, ("X", "W"): -6,
+    ("B", "D"): 4, ("D", "A"): 0, ("S", "L"): -4, ("X", "S"): 0,
+    ("F", "N"): -5, ("S", "R"): -1, ("W", "D"): -9, ("V", "Y"): -4,
+    ("W", "L"): -3, ("H", "R"): 2, ("W", "H"): -4, ("H", "N"): 2,
+    ("W", "T"): -7, ("T", "T"): 4, ("S", "F"): -4, ("W", "P"): -7,
+    ("L", "D"): -6, ("B", "I"): -3, ("L", "H"): -3, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 7, ("Y", "K"): -6, ("E", "Q"): 3,
+    ("Y", "G"): -7, ("Z", "S"): -1, ("Y", "C"): 0, ("G", "D"): 0,
+    ("B", "V"): -3, ("E", "A"): 0, ("Y", "W"): -1, ("E", "E"): 5,
+    ("Y", "S"): -4, ("C", "N"): -5, ("V", "C"): -3, ("T", "H"): -2,
+    ("P", "R"): -1, ("V", "G"): -2, ("T", "L"): -3, ("V", "K"): -4,
+    ("K", "Q"): 0, ("R", "A"): -3, ("I", "R"): -3, ("T", "D"): -1,
+    ("P", "F"): -6, ("I", "N"): -3, ("K", "I"): -3, ("M", "D"): -4,
+    ("V", "W"): -8, ("W", "W"): 18, ("M", "H"): -3, ("P", "N"): -1,
+    ("K", "A"): -2, ("M", "L"): 4, ("K", "E"): -1, ("Z", "E"): 5,
+    ("X", "N"): -1, ("Z", "A"): 0, ("Z", "M"): -2, ("X", "F"): -3,
+    ("K", "C"): -7, ("B", "Q"): 1, ("X", "B"): -1, ("B", "M"): -3,
+    ("F", "C"): -6, ("Z", "Q"): 5, ("X", "Z"): -1, ("F", "G"): -6,
+    ("B", "E"): 3, ("X", "V"): -1, ("F", "K"): -7, ("B", "A"): 0,
+    ("X", "R"): -2, ("D", "D"): 5, ("W", "G"): -9, ("Z", "F"): -7,
+    ("S", "Q"): -1, ("W", "C"): -10, ("W", "K"): -5, ("H", "Q"): 4,
+    ("L", "C"): -8, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -6,
+    ("W", "S"): -3, ("S", "E"): -1, ("H", "E"): 0, ("S", "I"): -2,
+    ("H", "A"): -2, ("S", "M"): -2, ("Y", "L"): -2, ("Y", "H"): 0,
+    ("Y", "D"): -6, ("E", "R"): -2, ("X", "P"): -1, ("G", "G"): 6,
+    ("G", "C"): -5, ("E", "N"): 2, ("Y", "T"): -4, ("Y", "P"): -7,
+    ("T", "K"): 0, ("A", "A"): 3, ("P", "Q"): 0, ("T", "C"): -3,
+    ("V", "H"): -3, ("T", "G"): -1, ("I", "Q"): -3, ("Z", "T"): -1,
+    ("C", "R"): -5, ("V", "P"): -2, ("P", "E"): -1, ("M", "C"): -7,
+    ("K", "N"): 1, ("I", "I"): 6, ("P", "A"): 1, ("M", "G"): -4,
+    ("T", "S"): 2, ("I", "E"): -3, ("P", "M"): -3, ("M", "K"): 1,
+    ("I", "A"): -1, ("P", "I"): -3, ("R", "R"): 8, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 3, ("X", "E"): -1,
+    ("Z", "N"): 1, ("X", "A"): -1, ("B", "R"): -2, ("B", "N"): 3,
+    ("F", "D"): -8, ("X", "Y"): -3, ("Z", "R"): 0, ("F", "H"): -3,
+    ("B", "F"): -6, ("F", "L"): 1, ("X", "Q"): -1, ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam250.cmp
+pam250 = {
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): 1, ("V", "T"): 0,
+    ("Q", "Q"): 4, ("N", "A"): 0, ("Z", "Y"): -4, ("W", "R"): 2,
+    ("Q", "A"): 0, ("S", "D"): 0, ("H", "H"): 6, ("S", "H"): -1,
+    ("H", "D"): 1, ("L", "N"): -3, ("W", "A"): -6, ("Y", "M"): -2,
+    ("G", "R"): -3, ("Y", "I"): -1, ("Y", "E"): -4, ("B", "Y"): -3,
+    ("Y", "A"): -3, ("V", "D"): -2, ("B", "S"): 0, ("Y", "Y"): 10,
+    ("G", "N"): 0, ("E", "C"): -5, ("Y", "Q"): -4, ("Z", "Z"): 3,
+    ("V", "A"): 0, ("C", "C"): 12, ("M", "R"): 0, ("V", "E"): -2,
+    ("T", "N"): 0, ("P", "P"): 6, ("V", "I"): 4, ("V", "S"): -1,
+    ("Z", "P"): 0, ("V", "M"): 2, ("T", "F"): -3, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -1, ("I", "H"): -2, ("I", "D"): -2,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): 0, ("F", "Q"): -5, ("Z", "G"): 0, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -1, ("D", "R"): -1,
+    ("B", "W"): -5, ("X", "D"): -1, ("Z", "K"): 0, ("F", "A"): -3,
+    ("Z", "W"): -6, ("F", "E"): -5, ("D", "N"): 2, ("B", "K"): 1,
+    ("X", "X"): -1, ("F", "I"): 1, ("B", "G"): 0, ("X", "T"): 0,
+    ("F", "M"): 0, ("B", "C"): -4, ("Z", "I"): -2, ("Z", "V"): -2,
+    ("S", "S"): 2, ("L", "Q"): -2, ("W", "E"): -7, ("Q", "R"): 1,
+    ("N", "N"): 2, ("W", "M"): -4, ("Q", "C"): -5, ("W", "I"): -5,
+    ("S", "C"): 0, ("L", "A"): -2, ("S", "G"): 1, ("L", "E"): -3,
+    ("W", "Q"): -5, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 1,
+    ("N", "R"): 0, ("H", "C"): -3, ("Y", "N"): -2, ("G", "Q"): -1,
+    ("Y", "F"): 7, ("C", "A"): -2, ("V", "L"): 2, ("G", "E"): 0,
+    ("G", "A"): 1, ("K", "R"): 3, ("E", "D"): 3, ("Y", "R"): -4,
+    ("M", "Q"): -1, ("T", "I"): 0, ("C", "D"): -5, ("V", "F"): -1,
+    ("T", "A"): 1, ("T", "P"): 0, ("B", "P"): -1, ("T", "E"): 0,
+    ("V", "N"): -2, ("P", "G"): 0, ("M", "A"): -1, ("K", "H"): 0,
+    ("V", "R"): -2, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 4, ("M", "I"): 2, ("T", "Q"): -1, ("I", "G"): -3,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): 0, ("I", "C"): -2,
+    ("Z", "D"): 3, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): 2,
+    ("X", "G"): -1, ("Z", "L"): -3, ("X", "C"): -3, ("Z", "H"): 2,
+    ("B", "L"): -3, ("B", "H"): 1, ("F", "F"): 9, ("X", "W"): -4,
+    ("B", "D"): 3, ("D", "A"): 0, ("S", "L"): -3, ("X", "S"): 0,
+    ("F", "N"): -3, ("S", "R"): 0, ("W", "D"): -7, ("V", "Y"): -2,
+    ("W", "L"): -2, ("H", "R"): 2, ("W", "H"): -3, ("H", "N"): 2,
+    ("W", "T"): -5, ("T", "T"): 3, ("S", "F"): -3, ("W", "P"): -6,
+    ("L", "D"): -4, ("B", "I"): -2, ("L", "H"): -2, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 6, ("Y", "K"): -4, ("E", "Q"): 2,
+    ("Y", "G"): -5, ("Z", "S"): 0, ("Y", "C"): 0, ("G", "D"): 1,
+    ("B", "V"): -2, ("E", "A"): 0, ("Y", "W"): 0, ("E", "E"): 4,
+    ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -1,
+    ("P", "R"): 0, ("V", "G"): -1, ("T", "L"): -2, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -2, ("T", "D"): 0,
+    ("P", "F"): -5, ("I", "N"): -2, ("K", "I"): -2, ("M", "D"): -3,
+    ("V", "W"): -6, ("W", "W"): 17, ("M", "H"): -2, ("P", "N"): 0,
+    ("K", "A"): -1, ("M", "L"): 4, ("K", "E"): 0, ("Z", "E"): 3,
+    ("X", "N"): 0, ("Z", "A"): 0, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -5, ("B", "Q"): 1, ("X", "B"): -1, ("B", "M"): -2,
+    ("F", "C"): -4, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -5,
+    ("B", "E"): 3, ("X", "V"): -1, ("F", "K"): -5, ("B", "A"): 0,
+    ("X", "R"): -1, ("D", "D"): 4, ("W", "G"): -7, ("Z", "F"): -5,
+    ("S", "Q"): -1, ("W", "C"): -8, ("W", "K"): -3, ("H", "Q"): 3,
+    ("L", "C"): -6, ("W", "N"): -4, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -2, ("S", "E"): 0, ("H", "E"): 1, ("S", "I"): -1,
+    ("H", "A"): -1, ("S", "M"): -2, ("Y", "L"): -1, ("Y", "H"): 0,
+    ("Y", "D"): -4, ("E", "R"): -1, ("X", "P"): -1, ("G", "G"): 5,
+    ("G", "C"): -3, ("E", "N"): 1, ("Y", "T"): -3, ("Y", "P"): -5,
+    ("T", "K"): 0, ("A", "A"): 2, ("P", "Q"): 0, ("T", "C"): -2,
+    ("V", "H"): -2, ("T", "G"): 0, ("I", "Q"): -2, ("Z", "T"): -1,
+    ("C", "R"): -4, ("V", "P"): -1, ("P", "E"): -1, ("M", "C"): -5,
+    ("K", "N"): 1, ("I", "I"): 5, ("P", "A"): 1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -2, ("P", "M"): -2, ("M", "K"): 0,
+    ("I", "A"): -1, ("P", "I"): -2, ("R", "R"): 6, ("X", "M"): -1,
+    ("L", "I"): 2, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 1, ("X", "A"): 0, ("B", "R"): -1, ("B", "N"): 2,
+    ("F", "D"): -6, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -4, ("F", "L"): 2, ("X", "Q"): -1, ("B", "B"): 3
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam30.cmp
+pam30 = {
+    ("W", "F"): -4, ("L", "R"): -8, ("S", "P"): -2, ("V", "T"): -3,
+    ("Q", "Q"): 8, ("N", "A"): -4, ("Z", "Y"): -9, ("W", "R"): -2,
+    ("Q", "A"): -4, ("S", "D"): -4, ("H", "H"): 9, ("S", "H"): -6,
+    ("H", "D"): -4, ("L", "N"): -7, ("W", "A"): -13, ("Y", "M"): -11,
+    ("G", "R"): -9, ("Y", "I"): -6, ("Y", "E"): -8, ("B", "Y"): -6,
+    ("Y", "A"): -8, ("V", "D"): -8, ("B", "S"): -1, ("Y", "Y"): 10,
+    ("G", "N"): -3, ("E", "C"): -14, ("Y", "Q"): -12, ("Z", "Z"): 6,
+    ("V", "A"): -2, ("C", "C"): 10, ("M", "R"): -4, ("V", "E"): -6,
+    ("T", "N"): -2, ("P", "P"): 8, ("V", "I"): 2, ("V", "S"): -6,
+    ("Z", "P"): -4, ("V", "M"): -1, ("T", "F"): -9, ("V", "Q"): -7,
+    ("K", "K"): 7, ("P", "D"): -8, ("I", "H"): -9, ("I", "D"): -7,
+    ("T", "R"): -6, ("P", "L"): -7, ("K", "G"): -7, ("M", "N"): -9,
+    ("P", "H"): -4, ("F", "Q"): -13, ("Z", "G"): -5, ("X", "L"): -6,
+    ("T", "M"): -4, ("Z", "C"): -14, ("X", "H"): -5, ("D", "R"): -10,
+    ("B", "W"): -10, ("X", "D"): -5, ("Z", "K"): -4, ("F", "A"): -8,
+    ("Z", "W"): -14, ("F", "E"): -14, ("D", "N"): 2, ("B", "K"): -2,
+    ("X", "X"): -5, ("F", "I"): -2, ("B", "G"): -3, ("X", "T"): -4,
+    ("F", "M"): -4, ("B", "C"): -12, ("Z", "I"): -6, ("Z", "V"): -6,
+    ("S", "S"): 6, ("L", "Q"): -5, ("W", "E"): -17, ("Q", "R"): -2,
+    ("N", "N"): 8, ("W", "M"): -13, ("Q", "C"): -14, ("W", "I"): -14,
+    ("S", "C"): -3, ("L", "A"): -6, ("S", "G"): -2, ("L", "E"): -9,
+    ("W", "Q"): -13, ("H", "G"): -9, ("S", "K"): -4, ("Q", "N"): -3,
+    ("N", "R"): -6, ("H", "C"): -7, ("Y", "N"): -4, ("G", "Q"): -7,
+    ("Y", "F"): 2, ("C", "A"): -6, ("V", "L"): -2, ("G", "E"): -4,
+    ("G", "A"): -2, ("K", "R"): 0, ("E", "D"): 2, ("Y", "R"): -10,
+    ("M", "Q"): -4, ("T", "I"): -2, ("C", "D"): -14, ("V", "F"): -8,
+    ("T", "A"): -1, ("T", "P"): -4, ("B", "P"): -7, ("T", "E"): -6,
+    ("V", "N"): -8, ("P", "G"): -6, ("M", "A"): -5, ("K", "H"): -6,
+    ("V", "R"): -8, ("P", "C"): -8, ("M", "E"): -7, ("K", "L"): -8,
+    ("V", "V"): 7, ("M", "I"): -1, ("T", "Q"): -5, ("I", "G"): -11,
+    ("P", "K"): -6, ("M", "M"): 11, ("K", "D"): -4, ("I", "C"): -6,
+    ("Z", "D"): 1, ("F", "R"): -9, ("X", "K"): -5, ("Q", "D"): -2,
+    ("X", "G"): -5, ("Z", "L"): -7, ("X", "C"): -9, ("Z", "H"): -1,
+    ("B", "L"): -9, ("B", "H"): -1, ("F", "F"): 9, ("X", "W"): -11,
+    ("B", "D"): 6, ("D", "A"): -3, ("S", "L"): -8, ("X", "S"): -3,
+    ("F", "N"): -9, ("S", "R"): -3, ("W", "D"): -15, ("V", "Y"): -7,
+    ("W", "L"): -6, ("H", "R"): -2, ("W", "H"): -7, ("H", "N"): 0,
+    ("W", "T"): -13, ("T", "T"): 7, ("S", "F"): -6, ("W", "P"): -14,
+    ("L", "D"): -12, ("B", "I"): -6, ("L", "H"): -6, ("S", "N"): 0,
+    ("B", "T"): -3, ("L", "L"): 7, ("Y", "K"): -9, ("E", "Q"): 1,
+    ("Y", "G"): -14, ("Z", "S"): -5, ("Y", "C"): -4, ("G", "D"): -3,
+    ("B", "V"): -8, ("E", "A"): -2, ("Y", "W"): -5, ("E", "E"): 8,
+    ("Y", "S"): -7, ("C", "N"): -11, ("V", "C"): -6, ("T", "H"): -7,
+    ("P", "R"): -4, ("V", "G"): -5, ("T", "L"): -7, ("V", "K"): -9,
+    ("K", "Q"): -3, ("R", "A"): -7, ("I", "R"): -5, ("T", "D"): -5,
+    ("P", "F"): -10, ("I", "N"): -5, ("K", "I"): -6, ("M", "D"): -11,
+    ("V", "W"): -15, ("W", "W"): 13, ("M", "H"): -10, ("P", "N"): -6,
+    ("K", "A"): -7, ("M", "L"): 1, ("K", "E"): -4, ("Z", "E"): 6,
+    ("X", "N"): -3, ("Z", "A"): -3, ("Z", "M"): -5, ("X", "F"): -8,
+    ("K", "C"): -14, ("B", "Q"): -3, ("X", "B"): -5, ("B", "M"): -10,
+    ("F", "C"): -13, ("Z", "Q"): 6, ("X", "Z"): -5, ("F", "G"): -9,
+    ("B", "E"): 1, ("X", "V"): -5, ("F", "K"): -14, ("B", "A"): -3,
+    ("X", "R"): -6, ("D", "D"): 8, ("W", "G"): -15, ("Z", "F"): -13,
+    ("S", "Q"): -5, ("W", "C"): -15, ("W", "K"): -12, ("H", "Q"): 1,
+    ("L", "C"): -15, ("W", "N"): -8, ("S", "A"): 0, ("L", "G"): -10,
+    ("W", "S"): -5, ("S", "E"): -4, ("H", "E"): -5, ("S", "I"): -7,
+    ("H", "A"): -7, ("S", "M"): -5, ("Y", "L"): -7, ("Y", "H"): -3,
+    ("Y", "D"): -11, ("E", "R"): -9, ("X", "P"): -5, ("G", "G"): 6,
+    ("G", "C"): -9, ("E", "N"): -2, ("Y", "T"): -6, ("Y", "P"): -13,
+    ("T", "K"): -3, ("A", "A"): 6, ("P", "Q"): -3, ("T", "C"): -8,
+    ("V", "H"): -6, ("T", "G"): -6, ("I", "Q"): -8, ("Z", "T"): -6,
+    ("C", "R"): -8, ("V", "P"): -6, ("P", "E"): -5, ("M", "C"): -13,
+    ("K", "N"): -1, ("I", "I"): 8, ("P", "A"): -2, ("M", "G"): -8,
+    ("T", "S"): 0, ("I", "E"): -5, ("P", "M"): -8, ("M", "K"): -2,
+    ("I", "A"): -5, ("P", "I"): -8, ("R", "R"): 8, ("X", "M"): -5,
+    ("L", "I"): -1, ("X", "I"): -5, ("Z", "B"): 0, ("X", "E"): -5,
+    ("Z", "N"): -3, ("X", "A"): -3, ("B", "R"): -7, ("B", "N"): 6,
+    ("F", "D"): -15, ("X", "Y"): -7, ("Z", "R"): -4, ("F", "H"): -6,
+    ("B", "F"): -10, ("F", "L"): -3, ("X", "Q"): -5, ("B", "B"): 6
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam300.cmp
+pam300 = {
+    ("W", "F"): 1, ("L", "R"): -3, ("S", "P"): 1, ("V", "T"): 0,
+    ("Q", "Q"): 4, ("N", "A"): 0, ("Z", "Y"): -5, ("W", "R"): 3,
+    ("Q", "A"): 0, ("S", "D"): 0, ("H", "H"): 7, ("S", "H"): -1,
+    ("H", "D"): 1, ("L", "N"): -3, ("W", "A"): -6, ("Y", "M"): -2,
+    ("G", "R"): -2, ("Y", "I"): -1, ("Y", "E"): -5, ("B", "Y"): -4,
+    ("Y", "A"): -4, ("V", "D"): -2, ("B", "S"): 1, ("Y", "Y"): 12,
+    ("G", "N"): 1, ("E", "C"): -6, ("Y", "Q"): -4, ("Z", "Z"): 3,
+    ("V", "A"): 0, ("C", "C"): 15, ("M", "R"): 0, ("V", "E"): -2,
+    ("T", "N"): 0, ("P", "P"): 6, ("V", "I"): 4, ("V", "S"): -1,
+    ("Z", "P"): 0, ("V", "M"): 2, ("T", "F"): -3, ("V", "Q"): -2,
+    ("K", "K"): 5, ("P", "D"): -1, ("I", "H"): -2, ("I", "D"): -2,
+    ("T", "R"): -1, ("P", "L"): -3, ("K", "G"): -2, ("M", "N"): -2,
+    ("P", "H"): 0, ("F", "Q"): -5, ("Z", "G"): 0, ("X", "L"): -1,
+    ("T", "M"): -1, ("Z", "C"): -6, ("X", "H"): 0, ("D", "R"): -1,
+    ("B", "W"): -6, ("X", "D"): -1, ("Z", "K"): 1, ("F", "A"): -4,
+    ("Z", "W"): -6, ("F", "E"): -6, ("D", "N"): 2, ("B", "K"): 1,
+    ("X", "X"): -1, ("F", "I"): 1, ("B", "G"): 1, ("X", "T"): 0,
+    ("F", "M"): 1, ("B", "C"): -5, ("Z", "I"): -2, ("Z", "V"): -2,
+    ("S", "S"): 1, ("L", "Q"): -2, ("W", "E"): -8, ("Q", "R"): 2,
+    ("N", "N"): 2, ("W", "M"): -5, ("Q", "C"): -6, ("W", "I"): -6,
+    ("S", "C"): 0, ("L", "A"): -2, ("S", "G"): 1, ("L", "E"): -4,
+    ("W", "Q"): -5, ("H", "G"): -2, ("S", "K"): 0, ("Q", "N"): 1,
+    ("N", "R"): 0, ("H", "C"): -4, ("Y", "N"): -2, ("G", "Q"): -1,
+    ("Y", "F"): 9, ("C", "A"): -2, ("V", "L"): 2, ("G", "E"): 0,
+    ("G", "A"): 2, ("K", "R"): 4, ("E", "D"): 4, ("Y", "R"): -5,
+    ("M", "Q"): -1, ("T", "I"): 0, ("C", "D"): -6, ("V", "F"): -1,
+    ("T", "A"): 1, ("T", "P"): 1, ("B", "P"): 0, ("T", "E"): 0,
+    ("V", "N"): -2, ("P", "G"): 0, ("M", "A"): -1, ("K", "H"): 0,
+    ("V", "R"): -3, ("P", "C"): -3, ("M", "E"): -2, ("K", "L"): -3,
+    ("V", "V"): 5, ("M", "I"): 3, ("T", "Q"): -1, ("I", "G"): -3,
+    ("P", "K"): -1, ("M", "M"): 6, ("K", "D"): 0, ("I", "C"): -3,
+    ("Z", "D"): 3, ("F", "R"): -5, ("X", "K"): -1, ("Q", "D"): 2,
+    ("X", "G"): -1, ("Z", "L"): -3, ("X", "C"): -3, ("Z", "H"): 2,
+    ("B", "L"): -4, ("B", "H"): 1, ("F", "F"): 11, ("X", "W"): -4,
+    ("B", "D"): 3, ("D", "A"): 0, ("S", "L"): -3, ("X", "S"): 0,
+    ("F", "N"): -4, ("S", "R"): 0, ("W", "D"): -7, ("V", "Y"): -3,
+    ("W", "L"): -2, ("H", "R"): 2, ("W", "H"): -3, ("H", "N"): 2,
+    ("W", "T"): -6, ("T", "T"): 2, ("S", "F"): -4, ("W", "P"): -6,
+    ("L", "D"): -4, ("B", "I"): -2, ("L", "H"): -2, ("S", "N"): 1,
+    ("B", "T"): 0, ("L", "L"): 7, ("Y", "K"): -5, ("E", "Q"): 3,
+    ("Y", "G"): -6, ("Z", "S"): 0, ("Y", "C"): 1, ("G", "D"): 1,
+    ("B", "V"): -2, ("E", "A"): 0, ("Y", "W"): 0, ("E", "E"): 4,
+    ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -1,
+    ("P", "R"): 0, ("V", "G"): -1, ("T", "L"): -2, ("V", "K"): -2,
+    ("K", "Q"): 1, ("R", "A"): -1, ("I", "R"): -2, ("T", "D"): 0,
+    ("P", "F"): -5, ("I", "N"): -2, ("K", "I"): -2, ("M", "D"): -3,
+    ("V", "W"): -7, ("W", "W"): 22, ("M", "H"): -2, ("P", "N"): 0,
+    ("K", "A"): -1, ("M", "L"): 4, ("K", "E"): 0, ("Z", "E"): 3,
+    ("X", "N"): 0, ("Z", "A"): 0, ("Z", "M"): -2, ("X", "F"): -2,
+    ("K", "C"): -6, ("B", "Q"): 2, ("X", "B"): 0, ("B", "M"): -2,
+    ("F", "C"): -5, ("Z", "Q"): 3, ("X", "Z"): -1, ("F", "G"): -5,
+    ("B", "E"): 3, ("X", "V"): 0, ("F", "K"): -6, ("B", "A"): 0,
+    ("X", "R"): -1, ("D", "D"): 4, ("W", "G"): -8, ("Z", "F"): -5,
+    ("S", "Q"): 0, ("W", "C"): -9, ("W", "K"): -4, ("H", "Q"): 3,
+    ("L", "C"): -7, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -4,
+    ("W", "S"): -3, ("S", "E"): 0, ("H", "E"): 1, ("S", "I"): -1,
+    ("H", "A"): -1, ("S", "M"): -2, ("Y", "L"): 0, ("Y", "H"): 0,
+    ("Y", "D"): -5, ("E", "R"): -1, ("X", "P"): -1, ("G", "G"): 5,
+    ("G", "C"): -4, ("E", "N"): 2, ("Y", "T"): -3, ("Y", "P"): -5,
+    ("T", "K"): 0, ("A", "A"): 2, ("P", "Q"): 0, ("T", "C"): -2,
+    ("V", "H"): -2, ("T", "G"): 0, ("I", "Q"): -2, ("Z", "T"): 0,
+    ("C", "R"): -4, ("V", "P"): -1, ("P", "E"): 0, ("M", "C"): -6,
+    ("K", "N"): 1, ("I", "I"): 5, ("P", "A"): 1, ("M", "G"): -3,
+    ("T", "S"): 1, ("I", "E"): -2, ("P", "M"): -2, ("M", "K"): 0,
+    ("I", "A"): 0, ("P", "I"): -2, ("R", "R"): 7, ("X", "M"): -1,
+    ("L", "I"): 3, ("X", "I"): -1, ("Z", "B"): 2, ("X", "E"): -1,
+    ("Z", "N"): 1, ("X", "A"): 0, ("B", "R"): 0, ("B", "N"): 2,
+    ("F", "D"): -6, ("X", "Y"): -2, ("Z", "R"): 0, ("F", "H"): -2,
+    ("B", "F"): -5, ("F", "L"): 3, ("X", "Q"): 0, ("B", "B"): 3
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam60.cmp
+pam60 = {
+    ("W", "F"): -3, ("L", "R"): -6, ("S", "P"): 0, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -7, ("W", "R"): 0,
+    ("Q", "A"): -3, ("S", "D"): -2, ("H", "H"): 8, ("S", "H"): -4,
+    ("H", "D"): -2, ("L", "N"): -5, ("W", "A"): -10, ("Y", "M"): -7,
+    ("G", "R"): -7, ("Y", "I"): -4, ("Y", "E"): -7, ("B", "Y"): -5,
+    ("Y", "A"): -6, ("V", "D"): -6, ("B", "S"): 0, ("Y", "Y"): 9,
+    ("G", "N"): -1, ("E", "C"): -10, ("Y", "Q"): -8, ("Z", "Z"): 5,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -4,
+    ("T", "N"): -1, ("P", "P"): 7, ("V", "I"): 3, ("V", "S"): -4,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -6, ("V", "Q"): -5,
+    ("K", "K"): 6, ("P", "D"): -5, ("I", "H"): -6, ("I", "D"): -5,
+    ("T", "R"): -4, ("P", "L"): -5, ("K", "G"): -5, ("M", "N"): -6,
+    ("P", "H"): -2, ("F", "Q"): -9, ("Z", "G"): -3, ("X", "L"): -4,
+    ("T", "M"): -2, ("Z", "C"): -10, ("X", "H"): -3, ("D", "R"): -6,
+    ("B", "W"): -8, ("X", "D"): -3, ("Z", "K"): -2, ("F", "A"): -6,
+    ("Z", "W"): -11, ("F", "E"): -10, ("D", "N"): 2, ("B", "K"): -1,
+    ("X", "X"): -3, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -2,
+    ("F", "M"): -2, ("B", "C"): -9, ("Z", "I"): -4, ("Z", "V"): -5,
+    ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -12, ("Q", "R"): 0,
+    ("N", "N"): 6, ("W", "M"): -9, ("Q", "C"): -10, ("W", "I"): -10,
+    ("S", "C"): -1, ("L", "A"): -4, ("S", "G"): 0, ("L", "E"): -7,
+    ("W", "Q"): -9, ("H", "G"): -6, ("S", "K"): -2, ("Q", "N"): -2,
+    ("N", "R"): -3, ("H", "C"): -6, ("Y", "N"): -3, ("G", "Q"): -5,
+    ("Y", "F"): 3, ("C", "A"): -5, ("V", "L"): -1, ("G", "E"): -2,
+    ("G", "A"): 0, ("K", "R"): 2, ("E", "D"): 3, ("Y", "R"): -8,
+    ("M", "Q"): -2, ("T", "I"): -1, ("C", "D"): -10, ("V", "F"): -5,
+    ("T", "A"): 1, ("T", "P"): -2, ("B", "P"): -4, ("T", "E"): -4,
+    ("V", "N"): -5, ("P", "G"): -4, ("M", "A"): -3, ("K", "H"): -4,
+    ("V", "R"): -5, ("P", "C"): -6, ("M", "E"): -5, ("K", "L"): -6,
+    ("V", "V"): 6, ("M", "I"): 1, ("T", "Q"): -4, ("I", "G"): -7,
+    ("P", "K"): -4, ("M", "M"): 10, ("K", "D"): -2, ("I", "C"): -4,
+    ("Z", "D"): 2, ("F", "R"): -7, ("X", "K"): -3, ("Q", "D"): -1,
+    ("X", "G"): -3, ("Z", "L"): -5, ("X", "C"): -6, ("Z", "H"): 0,
+    ("B", "L"): -7, ("B", "H"): 0, ("F", "F"): 8, ("X", "W"): -8,
+    ("B", "D"): 5, ("D", "A"): -2, ("S", "L"): -6, ("X", "S"): -2,
+    ("F", "N"): -6, ("S", "R"): -2, ("W", "D"): -11, ("V", "Y"): -5,
+    ("W", "L"): -4, ("H", "R"): 0, ("W", "H"): -5, ("H", "N"): 1,
+    ("W", "T"): -9, ("T", "T"): 6, ("S", "F"): -5, ("W", "P"): -10,
+    ("L", "D"): -9, ("B", "I"): -4, ("L", "H"): -4, ("S", "N"): 1,
+    ("B", "T"): -2, ("L", "L"): 6, ("Y", "K"): -7, ("E", "Q"): 2,
+    ("Y", "G"): -10, ("Z", "S"): -3, ("Y", "C"): -2, ("G", "D"): -2,
+    ("B", "V"): -5, ("E", "A"): -1, ("Y", "W"): -3, ("E", "E"): 7,
+    ("Y", "S"): -5, ("C", "N"): -7, ("V", "C"): -4, ("T", "H"): -5,
+    ("P", "R"): -2, ("V", "G"): -4, ("T", "L"): -5, ("V", "K"): -6,
+    ("K", "Q"): -1, ("R", "A"): -5, ("I", "R"): -4, ("T", "D"): -3,
+    ("P", "F"): -7, ("I", "N"): -4, ("K", "I"): -4, ("M", "D"): -7,
+    ("V", "W"): -11, ("W", "W"): 13, ("M", "H"): -7, ("P", "N"): -4,
+    ("K", "A"): -5, ("M", "L"): 2, ("K", "E"): -3, ("Z", "E"): 5,
+    ("X", "N"): -2, ("Z", "A"): -2, ("Z", "M"): -4, ("X", "F"): -5,
+    ("K", "C"): -10, ("B", "Q"): -1, ("X", "B"): -3, ("B", "M"): -6,
+    ("F", "C"): -9, ("Z", "Q"): 6, ("X", "Z"): -3, ("F", "G"): -7,
+    ("B", "E"): 2, ("X", "V"): -3, ("F", "K"): -10, ("B", "A"): -2,
+    ("X", "R"): -4, ("D", "D"): 7, ("W", "G"): -11, ("Z", "F"): -10,
+    ("S", "Q"): -3, ("W", "C"): -12, ("W", "K"): -8, ("H", "Q"): 2,
+    ("L", "C"): -11, ("W", "N"): -6, ("S", "A"): 1, ("L", "G"): -8,
+    ("W", "S"): -4, ("S", "E"): -2, ("H", "E"): -3, ("S", "I"): -4,
+    ("H", "A"): -5, ("S", "M"): -4, ("Y", "L"): -5, ("Y", "H"): -2,
+    ("Y", "D"): -8, ("E", "R"): -6, ("X", "P"): -3, ("G", "G"): 6,
+    ("G", "C"): -7, ("E", "N"): 0, ("Y", "T"): -5, ("Y", "P"): -10,
+    ("T", "K"): -2, ("A", "A"): 5, ("P", "Q"): -1, ("T", "C"): -5,
+    ("V", "H"): -5, ("T", "G"): -3, ("I", "Q"): -5, ("Z", "T"): -4,
+    ("C", "R"): -6, ("V", "P"): -4, ("P", "E"): -3, ("M", "C"): -10,
+    ("K", "N"): 0, ("I", "I"): 7, ("P", "A"): 0, ("M", "G"): -6,
+    ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -6, ("M", "K"): 0,
+    ("I", "A"): -3, ("P", "I"): -6, ("R", "R"): 8, ("X", "M"): -3,
+    ("L", "I"): 0, ("X", "I"): -3, ("Z", "B"): 1, ("X", "E"): -3,
+    ("Z", "N"): -1, ("X", "A"): -2, ("B", "R"): -5, ("B", "N"): 5,
+    ("F", "D"): -11, ("X", "Y"): -5, ("Z", "R"): -2, ("F", "H"): -4,
+    ("B", "F"): -8, ("F", "L"): -1, ("X", "Q"): -3, ("B", "B"): 5
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/pam90.cmp
+pam90 = {
+    ("W", "F"): -2, ("L", "R"): -5, ("S", "P"): 0, ("V", "T"): -1,
+    ("Q", "Q"): 6, ("N", "A"): -1, ("Z", "Y"): -6, ("W", "R"): 0,
+    ("Q", "A"): -2, ("S", "D"): -1, ("H", "H"): 8, ("S", "H"): -3,
+    ("H", "D"): -1, ("L", "N"): -4, ("W", "A"): -8, ("Y", "M"): -6,
+    ("G", "R"): -5, ("Y", "I"): -3, ("Y", "E"): -6, ("B", "Y"): -4,
+    ("Y", "A"): -5, ("V", "D"): -4, ("B", "S"): 0, ("Y", "Y"): 9,
+    ("G", "N"): -1, ("E", "C"): -8, ("Y", "Q"): -6, ("Z", "Z"): 5,
+    ("V", "A"): 0, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): 0, ("P", "P"): 7, ("V", "I"): 3, ("V", "S"): -3,
+    ("Z", "P"): -2, ("V", "M"): 1, ("T", "F"): -5, ("V", "Q"): -4,
+    ("K", "K"): 5, ("P", "D"): -4, ("I", "H"): -5, ("I", "D"): -4,
+    ("T", "R"): -3, ("P", "L"): -4, ("K", "G"): -4, ("M", "N"): -4,
+    ("P", "H"): -2, ("F", "Q"): -7, ("Z", "G"): -2, ("T", "M"): -2,
+    ("Z", "C"): -8, ("D", "R"): -5, ("B", "W"): -7, ("Z", "K"): -1,
+    ("F", "A"): -5, ("Z", "W"): -8, ("F", "E"): -8, ("D", "N"): 3,
+    ("B", "K"): 0, ("F", "I"): 0, ("B", "G"): -1, ("F", "M"): -1,
+    ("B", "C"): -7, ("Z", "I"): -3, ("Z", "V"): -3, ("S", "S"): 4,
+    ("L", "Q"): -3, ("W", "E"): -10, ("Q", "R"): 0, ("N", "N"): 5,
+    ("W", "M"): -7, ("Q", "C"): -8, ("W", "I"): -8, ("S", "C"): -1,
+    ("L", "A"): -3, ("S", "G"): 0, ("L", "E"): -5, ("W", "Q"): -7,
+    ("H", "G"): -5, ("S", "K"): -1, ("L", "I"): 1, ("N", "R"): -2,
+    ("H", "C"): -5, ("Y", "N"): -2, ("G", "Q"): -3, ("Y", "F"): 4,
+    ("C", "A"): -3, ("V", "L"): 0, ("G", "E"): -1, ("G", "A"): 0,
+    ("K", "R"): 2, ("E", "D"): 4, ("Y", "R"): -6, ("M", "Q"): -2,
+    ("T", "I"): 0, ("C", "D"): -8, ("V", "F"): -4, ("T", "A"): 1,
+    ("T", "P"): -1, ("B", "P"): -3, ("T", "E"): -2, ("V", "N"): -4,
+    ("P", "G"): -3, ("M", "A"): -2, ("K", "H"): -2, ("V", "R"): -4,
+    ("P", "C"): -5, ("M", "E"): -4, ("K", "L"): -5, ("V", "V"): 6,
+    ("M", "I"): 1, ("T", "Q"): -3, ("I", "G"): -5, ("P", "K"): -3,
+    ("M", "M"): 9, ("K", "D"): -2, ("I", "C"): -3, ("Z", "D"): 3,
+    ("F", "R"): -6, ("Q", "D"): 0, ("Z", "L"): -4, ("Z", "H"): 1,
+    ("B", "L"): -5, ("B", "H"): 1, ("F", "F"): 8, ("B", "D"): 5,
+    ("D", "A"): -1, ("S", "L"): -5, ("F", "N"): -5, ("S", "R"): -1,
+    ("W", "D"): -9, ("W", "L"): -3, ("H", "R"): 1, ("W", "H"): -4,
+    ("H", "N"): 2, ("W", "T"): -7, ("T", "T"): 5, ("S", "F"): -4,
+    ("W", "P"): -8, ("L", "D"): -7, ("B", "I"): -3, ("L", "H"): -3,
+    ("S", "N"): 1, ("B", "T"): -1, ("L", "L"): 6, ("Y", "K"): -6,
+    ("E", "Q"): 2, ("Y", "G"): -8, ("Z", "S"): -2, ("Y", "C"): -1,
+    ("G", "D"): -1, ("B", "V"): -4, ("E", "A"): 0, ("Y", "W"): -2,
+    ("E", "E"): 6, ("Y", "S"): -4, ("C", "N"): -6, ("V", "C"): -3,
+    ("T", "H"): -3, ("P", "R"): -1, ("V", "G"): -3, ("T", "L"): -3,
+    ("V", "K"): -5, ("K", "Q"): -1, ("R", "A"): -4, ("I", "R"): -3,
+    ("T", "D"): -2, ("P", "F"): -6, ("I", "N"): -3, ("K", "I"): -3,
+    ("M", "D"): -5, ("V", "W"): -9, ("W", "W"): 13, ("M", "H"): -5,
+    ("P", "N"): -2, ("K", "A"): -3, ("M", "L"): 2, ("K", "E"): -2,
+    ("Z", "E"): 5, ("Q", "N"): -1, ("Z", "A"): -1, ("Z", "M"): -3,
+    ("K", "C"): -8, ("B", "Q"): 0, ("B", "M"): -5, ("F", "C"): -7,
+    ("Z", "Q"): 5, ("F", "G"): -6, ("B", "E"): 2, ("F", "K"): -8,
+    ("B", "A"): -1, ("D", "D"): 6, ("W", "G"): -9, ("S", "Q"): -2,
+    ("W", "C"): -10, ("W", "K"): -6, ("H", "Q"): 2, ("L", "C"): -9,
+    ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -6, ("W", "S"): -3,
+    ("S", "E"): -2, ("H", "E"): -1, ("S", "I"): -3, ("H", "A"): -4,
+    ("S", "M"): -3, ("Y", "L"): -3, ("Y", "H"): -1, ("Y", "D"): -6,
+    ("E", "R"): -4, ("G", "G"): 5, ("G", "C"): -5, ("E", "N"): 0,
+    ("Y", "T"): -4, ("Y", "P"): -8, ("T", "K"): -1, ("A", "A"): 4,
+    ("P", "Q"): -1, ("T", "C"): -4, ("V", "H"): -4, ("T", "G"): -2,
+    ("I", "Q"): -4, ("Z", "T"): -2, ("C", "R"): -5, ("V", "P"): -3,
+    ("P", "E"): -2, ("M", "C"): -8, ("K", "N"): 1, ("I", "I"): 6,
+    ("P", "A"): 0, ("M", "G"): -5, ("T", "S"): 2, ("I", "E"): -3,
+    ("P", "M"): -4, ("M", "K"): 0, ("I", "A"): -2, ("P", "I"): -4,
+    ("R", "R"): 7, ("Z", "F"): -8, ("Z", "B"): 2, ("Z", "N"): 0,
+    ("B", "R"): -3, ("B", "N"): 4, ("F", "D"): -8, ("Z", "R"): -1,
+    ("F", "H"): -3, ("B", "F"): -6, ("F", "L"): 0, ("V", "Y"): -4,
+    ("B", "B"): 4
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/rao.cmp
+rao = {
+    ("W", "F"): 11, ("S", "P"): 10, ("N", "M"): 6, ("Q", "Q"): 16,
+    ("N", "A"): 9, ("N", "E"): 10, ("W", "V"): 11, ("Q", "E"): 11,
+    ("L", "H"): 10, ("W", "R"): 7, ("Q", "A"): 11, ("S", "D"): 10,
+    ("H", "H"): 16, ("Q", "M"): 9, ("S", "H"): 10, ("H", "D"): 9,
+    ("Q", "I"): 6, ("S", "L"): 8, ("Y", "M"): 8, ("Y", "I"): 10,
+    ("Y", "E"): 6, ("Y", "A"): 9, ("G", "F"): 7, ("V", "T"): 10,
+    ("Y", "Y"): 16, ("V", "H"): 9, ("E", "C"): 9, ("Y", "Q"): 8,
+    ("V", "A"): 9, ("C", "C"): 16, ("V", "E"): 4, ("T", "N"): 10,
+    ("R", "K"): 11, ("P", "P"): 16, ("V", "I"): 12, ("R", "G"): 7,
+    ("V", "M"): 9, ("T", "F"): 10, ("R", "C"): 8, ("V", "Q"): 6,
+    ("K", "K"): 16, ("P", "D"): 8, ("I", "H"): 8, ("M", "F"): 10,
+    ("I", "D"): 3, ("K", "C"): 9, ("P", "L"): 4, ("K", "G"): 7,
+    ("P", "H"): 5, ("T", "R"): 9, ("F", "A"): 10, ("F", "E"): 6,
+    ("S", "S"): 16, ("W", "E"): 7, ("N", "N"): 16, ("W", "M"): 10,
+    ("Q", "C"): 10, ("N", "F"): 6, ("S", "C"): 10, ("L", "A"): 11,
+    ("S", "G"): 11, ("L", "E"): 7, ("W", "Q"): 9, ("H", "G"): 7,
+    ("S", "K"): 10, ("Q", "N"): 11, ("V", "D"): 3, ("H", "C"): 10,
+    ("Y", "N"): 8, ("Y", "F"): 10, ("W", "I"): 11, ("C", "A"): 11,
+    ("G", "E"): 6, ("G", "A"): 8, ("Y", "V"): 10, ("E", "D"): 11,
+    ("W", "H"): 10, ("Y", "R"): 7, ("N", "I"): 5, ("R", "L"): 6,
+    ("T", "I"): 10, ("Q", "L"): 9, ("R", "H"): 10, ("T", "M"): 8,
+    ("V", "F"): 11, ("R", "D"): 10, ("T", "A"): 10, ("T", "P"): 8,
+    ("T", "E"): 8, ("V", "N"): 5, ("P", "G"): 11, ("M", "A"): 11,
+    ("K", "H"): 11, ("V", "R"): 5, ("P", "C"): 7, ("M", "E"): 8,
+    ("V", "V"): 16, ("T", "T"): 16, ("M", "I"): 9, ("T", "Q"): 10,
+    ("I", "G"): 6, ("P", "K"): 6, ("M", "M"): 16, ("K", "D"): 11,
+    ("I", "C"): 8, ("L", "C"): 11, ("F", "F"): 16, ("D", "A"): 9,
+    ("S", "R"): 9, ("W", "D"): 6, ("N", "C"): 9, ("N", "G"): 10,
+    ("W", "T"): 11, ("Q", "G"): 8, ("S", "F"): 8, ("W", "P"): 6,
+    ("L", "D"): 6, ("H", "F"): 9, ("Q", "K"): 12, ("S", "N"): 11,
+    ("L", "L"): 16, ("Q", "F"): 7, ("Y", "K"): 7, ("Y", "G"): 10,
+    ("Y", "C"): 10, ("G", "D"): 9, ("E", "A"): 10, ("Y", "W"): 11,
+    ("E", "E"): 16, ("Y", "S"): 11, ("R", "M"): 6, ("V", "C"): 8,
+    ("T", "H"): 10, ("R", "I"): 4, ("V", "G"): 6, ("T", "L"): 9,
+    ("R", "E"): 9, ("V", "K"): 5, ("R", "Q"): 10, ("R", "A"): 8,
+    ("T", "D"): 9, ("P", "F"): 4, ("V", "S"): 8, ("K", "I"): 4,
+    ("M", "D"): 5, ("W", "W"): 16, ("M", "H"): 10, ("P", "N"): 9,
+    ("I", "F"): 12, ("K", "A"): 10, ("M", "L"): 11, ("K", "E"): 11,
+    ("N", "K"): 11, ("R", "P"): 6, ("L", "F"): 11, ("F", "C"): 10,
+    ("W", "G"): 8, ("W", "L"): 11, ("D", "D"): 16, ("N", "H"): 10,
+    ("S", "Q"): 10, ("Q", "P"): 7, ("N", "L"): 7, ("W", "K"): 7,
+    ("Q", "D"): 11, ("W", "N"): 8, ("S", "A"): 10, ("L", "G"): 6,
+    ("W", "S"): 10, ("S", "E"): 9, ("L", "K"): 7, ("H", "E"): 11,
+    ("S", "I"): 8, ("Q", "H"): 11, ("H", "A"): 11, ("S", "M"): 7,
+    ("Y", "L"): 9, ("Y", "H"): 9, ("Y", "D"): 7, ("G", "G"): 16,
+    ("G", "C"): 8, ("Y", "T"): 11, ("W", "C"): 11, ("Y", "P"): 8,
+    ("T", "K"): 9, ("R", "N"): 10, ("A", "A"): 16, ("W", "A"): 11,
+    ("T", "C"): 10, ("N", "D"): 11, ("R", "F"): 5, ("T", "G"): 10,
+    ("V", "L"): 10, ("V", "P"): 3, ("P", "E"): 5, ("M", "C"): 10,
+    ("I", "I"): 16, ("P", "A"): 6, ("M", "G"): 4, ("T", "S"): 11,
+    ("I", "E"): 4, ("P", "M"): 2, ("M", "K"): 8, ("K", "F"): 6,
+    ("I", "A"): 9, ("P", "I"): 3, ("R", "R"): 16, ("L", "I"): 10,
+    ("F", "D"): 4, ("D", "C"): 8
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/risler.cmp
+risler = {
+    ("W", "F"): -0.9, ("S", "P"): -0.3, ("N", "M"): 0.0, ("Q", "Q"): 2.2,
+    ("N", "A"): 1.3, ("N", "E"): 1.4, ("W", "V"): -0.7, ("Q", "E"): 2.1,
+    ("L", "H"): -0.9, ("W", "R"): -0.8, ("Q", "A"): 1.8, ("S", "D"): 0.7,
+    ("H", "H"): 2.2, ("Q", "M"): 1.2, ("S", "H"): -0.4, ("H", "D"): -1.3,
+    ("Q", "I"): 1.4, ("S", "L"): 1.3, ("Y", "M"): -0.2, ("Y", "I"): 0.4,
+    ("Y", "E"): 0.2, ("Y", "A"): 0.2, ("G", "F"): -0.4, ("V", "T"): 1.6,
+    ("Y", "Y"): 2.2, ("V", "H"): -0.7, ("E", "C"): -1.5, ("Y", "Q"): 0.5,
+    ("V", "A"): 2.0, ("C", "C"): 2.2, ("V", "E"): 1.6, ("T", "N"): 1.1,
+    ("R", "K"): 2.1, ("P", "P"): 2.2, ("V", "I"): 2.2, ("R", "G"): 0.1,
+    ("V", "M"): 0.8, ("T", "F"): 0.3, ("R", "C"): -1.5, ("V", "Q"): 1.5,
+    ("K", "K"): 2.2, ("P", "D"): -1.2, ("I", "H"): -0.8, ("M", "F"): -0.2,
+    ("I", "D"): 0.0, ("K", "C"): -1.6, ("P", "L"): -0.8, ("K", "G"): -0.1,
+    ("P", "H"): -1.6, ("T", "R"): 1.9, ("F", "A"): 0.6, ("F", "E"): 0.6,
+    ("S", "S"): 2.2, ("W", "E"): -1.0, ("N", "N"): 2.2, ("W", "M"): -1.3,
+    ("Q", "C"): -1.4, ("N", "F"): 0.4, ("S", "C"): -1.3, ("L", "A"): 1.3,
+    ("S", "G"): 0.7, ("L", "E"): 0.9, ("W", "Q"): -1.0, ("H", "G"): -1.2,
+    ("S", "K"): 1.4, ("Q", "N"): 1.6, ("V", "D"): 0.0, ("H", "C"): -1.8,
+    ("Y", "N"): -0.1, ("Y", "F"): 2.0, ("W", "I"): -0.7, ("C", "A"): -1.5,
+    ("G", "E"): 0.3, ("G", "A"): 0.6, ("Y", "V"): 0.3, ("E", "D"): 1.0,
+    ("W", "H"): -1.7, ("Y", "R"): 0.8, ("N", "I"): 0.9, ("R", "L"): 1.2,
+    ("T", "I"): 1.6, ("Q", "L"): 1.1, ("R", "H"): -0.4, ("T", "M"): 0.8,
+    ("V", "F"): 0.8, ("R", "D"): -0.1, ("T", "A"): 1.9, ("T", "P"): -0.5,
+    ("T", "E"): 1.6, ("V", "N"): 1.1, ("P", "G"): -1.2, ("M", "A"): 1.0,
+    ("K", "H"): -1.0, ("V", "R"): 1.5, ("P", "C"): -1.8, ("M", "E"): 0.6,
+    ("V", "V"): 2.2, ("T", "T"): 2.2, ("M", "I"): 0.9, ("T", "Q"): 1.7,
+    ("I", "G"): 0.0, ("P", "K"): -0.7, ("M", "M"): 2.2, ("K", "D"): 0.1,
+    ("I", "C"): -1.6, ("L", "C"): -1.5, ("F", "F"): 2.2, ("D", "A"): 0.2,
+    ("S", "R"): 2.0, ("W", "D"): -1.4, ("N", "C"): -1.6, ("N", "G"): 0.2,
+    ("W", "T"): -1.0, ("Q", "G"): 0.2, ("S", "F"): 0.5, ("W", "P"): -1.6,
+    ("L", "D"): -0.2, ("H", "F"): -1.1, ("Q", "K"): 1.7, ("S", "N"): 1.9,
+    ("L", "L"): 2.2, ("Q", "F"): 0.7, ("Y", "K"): 0.5, ("Y", "G"): -0.2,
+    ("Y", "C"): -1.1, ("G", "D"): -0.4, ("E", "A"): 1.7, ("Y", "W"): -0.6,
+    ("E", "E"): 2.2, ("Y", "S"): 0.4, ("R", "M"): 1.1, ("V", "C"): -1.4,
+    ("T", "H"): -0.9, ("R", "I"): 1.4, ("V", "G"): 0.1, ("T", "L"): 1.2,
+    ("R", "E"): 1.9, ("V", "K"): 1.2, ("R", "Q"): 2.0, ("R", "A"): 1.5,
+    ("T", "D"): 0.0, ("P", "F"): -1.1, ("V", "S"): 1.8, ("K", "I"): 1.0,
+    ("M", "D"): -0.5, ("W", "W"): 2.2, ("M", "H"): -1.2, ("P", "N"): -1.0,
+    ("I", "F"): 1.0, ("K", "A"): 1.4, ("M", "L"): 1.8, ("K", "E"): 1.4,
+    ("N", "K"): 1.0, ("R", "P"): -0.3, ("L", "F"): 1.0, ("F", "C"): -1.6,
+    ("W", "G"): -1.3, ("W", "L"): -0.8, ("D", "D"): 2.2, ("N", "H"): -0.3,
+    ("S", "Q"): 1.8, ("Q", "P"): -0.6, ("N", "L"): 0.8, ("W", "K"): -1.1,
+    ("Q", "D"): 0.6, ("W", "N"): -1.1, ("S", "A"): 2.0, ("L", "G"): -0.2,
+    ("W", "S"): -0.8, ("S", "E"): 1.8, ("L", "K"): 0.7, ("H", "E"): -0.6,
+    ("S", "I"): 1.6, ("Q", "H"): -0.5, ("H", "A"): -0.6, ("S", "M"): 0.6,
+    ("Y", "L"): 0.5, ("Y", "H"): -0.8, ("Y", "D"): -0.4, ("G", "G"): 2.2,
+    ("G", "C"): -1.7, ("Y", "T"): 0.3, ("W", "C"): -1.8, ("Y", "P"): -1.2,
+    ("T", "K"): 1.2, ("R", "N"): 1.2, ("A", "A"): 2.2, ("W", "A"): -0.9,
+    ("T", "C"): -1.4, ("N", "D"): 0.8, ("R", "F"): 0.4, ("T", "G"): 0.2,
+    ("V", "L"): 2.0, ("V", "P"): -0.6, ("P", "E"): -0.1, ("M", "C"): -1.6,
+    ("I", "I"): 2.2, ("P", "A"): -0.2, ("M", "G"): -0.4, ("T", "S"): 2.1,
+    ("I", "E"): 1.5, ("P", "M"): -1.2, ("M", "K"): 0.4, ("K", "F"): 0.1,
+    ("I", "A"): 1.7, ("P", "I"): -0.6, ("R", "R"): 2.2, ("L", "I"): 2.1,
+    ("F", "D"): -0.3, ("D", "C"): -1.7
+}
+
+
+# http://www.embl-heidelberg.de/~vogt/matrices/str.cmp
+structure = {
+    ("W", "F"): 2, ("L", "R"): -3, ("I", "I"): 6, ("Q", "Q"): 6,
+    ("W", "N"): -5, ("V", "I"): 2, ("H", "T"): -2, ("H", "P"): -3,
+    ("W", "V"): -4, ("Q", "E"): 2, ("W", "R"): -2, ("Q", "A"): 0,
+    ("H", "H"): 8, ("H", "D"): 0, ("L", "N"): -3, ("Y", "M"): -1,
+    ("Y", "I"): -1, ("Y", "E"): -2, ("E", "S"): -1, ("Y", "A"): -3,
+    ("Y", "Y"): 7, ("T", "C"): -5, ("E", "C"): -3, ("Y", "Q"): -3,
+    ("E", "G"): -2, ("V", "A"): 0, ("C", "C"): 11, ("M", "R"): -4,
+    ("P", "T"): -1, ("V", "E"): -2, ("P", "P"): 7, ("I", "T"): -2,
+    ("K", "S"): -1, ("R", "G"): -2, ("I", "P"): -4, ("R", "C"): -2,
+    ("A", "T"): -1, ("K", "K"): 5, ("A", "P"): -1, ("V", "M"): 0,
+    ("I", "D"): -3, ("K", "C"): -4, ("K", "G"): -3, ("R", "S"): 0,
+    ("F", "Q"): -4, ("F", "A"): -3, ("V", "V"): 5, ("M", "N"): -2,
+    ("F", "E"): -4, ("D", "N"): 2, ("F", "I"): 1, ("F", "M"): 0,
+    ("M", "S"): -4, ("S", "S"): 4, ("L", "Q"): -3, ("W", "E"): -6,
+    ("W", "A"): -3, ("W", "M"): -2, ("H", "S"): -2, ("W", "I"): -2,
+    ("S", "C"): -4, ("L", "A"): -2, ("L", "E"): -4, ("W", "Q"): -5,
+    ("H", "G"): -3, ("Q", "N"): 0, ("H", "C"): -6, ("L", "M"): 3,
+    ("W", "Y"): 2, ("Y", "N"): -1, ("E", "P"): -1, ("Y", "F"): 3,
+    ("E", "T"): 0, ("A", "A"): 4, ("I", "N"): -3, ("G", "A"): 0,
+    ("Y", "V"): -1, ("E", "D"): 2, ("W", "H"): -3, ("Y", "R"): -1,
+    ("M", "Q"): 1, ("P", "S"): -1, ("R", "H"): 0, ("A", "C"): -2,
+    ("R", "D"): -2, ("K", "P"): -1, ("L", "D"): -6, ("K", "T"): 0,
+    ("V", "N"): -4, ("M", "A"): 0, ("K", "H"): 0, ("V", "R"): -3,
+    ("P", "C"): -8, ("M", "E"): -2, ("A", "S"): 0, ("T", "T"): 5,
+    ("R", "T"): -1, ("I", "G"): -5, ("R", "P"): -2, ("K", "D"): -1,
+    ("I", "C"): -4, ("F", "R"): -4, ("F", "V"): -1, ("L", "C"): -6,
+    ("F", "F"): 7, ("D", "A"): -1, ("F", "N"): -3, ("W", "D"): -6,
+    ("L", "P"): -3, ("Q", "S"): -1, ("N", "C"): -6, ("N", "G"): -1,
+    ("H", "N"): 2, ("W", "T"): -5, ("Q", "G"): -2, ("W", "P"): -4,
+    ("Q", "C"): -3, ("N", "S"): 0, ("L", "H"): -3, ("L", "L"): 5,
+    ("G", "T"): -3, ("M", "M"): 8, ("G", "P"): -2, ("Y", "K"): -2,
+    ("Y", "G"): -3, ("Y", "C"): -6, ("E", "A"): 0, ("E", "E"): 5,
+    ("Y", "S"): -2, ("M", "P"): -6, ("V", "C"): -4, ("M", "T"): -2,
+    ("V", "G"): -4, ("R", "E"): 0, ("V", "K"): -3, ("K", "Q"): 1,
+    ("R", "A"): -1, ("I", "R"): -3, ("N", "A"): -1, ("V", "S"): -3,
+    ("M", "D"): -4, ("M", "H"): -2, ("K", "A"): -1, ("R", "Q"): 1,
+    ("K", "E"): 1, ("F", "S"): -3, ("I", "K"): -3, ("D", "P"): -1,
+    ("D", "T"): -1, ("I", "M"): 1, ("F", "C"): -2, ("W", "L"): -1,
+    ("F", "G"): -6, ("F", "K"): -3, ("F", "T"): -3, ("D", "D"): 6,
+    ("Q", "T"): 0, ("W", "G"): -4, ("Q", "P"): -2, ("W", "C"): -6,
+    ("W", "K"): -3, ("H", "Q"): 0, ("Q", "D"): 0, ("W", "W"): 10,
+    ("V", "L"): 1, ("L", "G"): -5, ("W", "S"): -5, ("L", "K"): -2,
+    ("N", "P"): -2, ("H", "E"): -2, ("N", "T"): 0, ("H", "A"): -2,
+    ("Y", "L"): -2, ("Y", "H"): 0, ("G", "S"): -1, ("Y", "D"): -3,
+    ("V", "Q"): -2, ("L", "T"): -3, ("G", "G"): 5, ("G", "C"): -6,
+    ("E", "N"): 0, ("Y", "T"): -2, ("Y", "P"): -6, ("R", "N"): -1,
+    ("V", "D"): -4, ("K", "R"): 2, ("V", "H"): -2, ("I", "Q"): -5,
+    ("V", "P"): -4, ("M", "C"): -5, ("K", "N"): 0, ("V", "T"): -1,
+    ("M", "G"): -4, ("T", "S"): 1, ("I", "E"): -3, ("M", "K"): -1,
+    ("I", "A"): -2, ("N", "N"): 5, ("R", "R"): 7, ("F", "P"): -5,
+    ("L", "I"): 2, ("I", "S"): -3, ("D", "S"): 0, ("L", "S"): -4,
+    ("I", "H"): -5, ("F", "D"): -5, ("D", "C"): -7, ("F", "H"): -2,
+    ("D", "G"): -1, ("F", "L"): 2
+}
diff --git a/code/lib/Bio/SubsMat/__init__.py b/code/lib/Bio/SubsMat/__init__.py
new file mode 100644
index 0000000..17c1955
--- /dev/null
+++ b/code/lib/Bio/SubsMat/__init__.py
@@ -0,0 +1,600 @@
+# Copyright 2000-2009 by Iddo Friedberg.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Iddo Friedberg idoerg@cc.huji.ac.il
+
+"""Substitution matrices, log odds matrices, and operations on them.
+
+General:
+--------
+
+This module provides a class and a few routines for generating
+substitution matrices, similar to BLOSUM or PAM matrices, but based on
+user-provided data. The class used for these matrices is SeqMat.
+
+Matrices are implemented as dictionaries. Each key is a 2-tuple holding
+the two residue/nucleotide types being replaced. The value depends on the
+matrix's purpose: e.g. in a log-odds frequency matrix, the value would be
+log(Pij/(Pi*Pj)) where:
+Pij: frequency of substitution of letter (residue/nucleotide) i by j
+Pi, Pj: expected frequencies of i and j, respectively.
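+For example (with illustrative numbers), if Pij = 0.02 and Pi = Pj = 0.1,
+the log-odds value for that pair is log(0.02 / (0.1 * 0.1)) = log(2).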
+
+Usage:
+------
+The following sections are laid out in the order in which most people
+will want to generate a log-odds matrix. Of course, the interim matrices
+can also be generated and inspected on their own.
+
+Generating an Accepted Replacement Matrix:
+------------------------------------------
+Initially, you should generate an accepted replacement matrix (ARM) from
+your data. The values in the ARM are the _counted_ numbers of
+replacements in your data, which could be a set of pairs or multiple
+alignments. So, for instance, if Alanine was replaced by Cysteine 10
+times, and Cysteine by Alanine 12 times, the corresponding ARM entries
+would be:
+('A','C'): 10,
+('C','A'): 12
+Since order doesn't matter, the user may instead provide a single
+combined entry:
+('A','C'): 22
+A SeqMat instance may be initialized with either a full (the first way of
+counting: 10, 12) or half (the latter way: 22) matrix. A full matrix over
+the 20-letter protein alphabet has 20x20 = 400 entries; the corresponding
+half matrix has 20*20/2 + 20/2 = 210 entries, since same-letter entries
+(the matrix diagonal) appear only once. Given an alphabet of size N:
+Full matrix size: N*N
+Half matrix size: N*(N+1)/2
+
+If you provide a full matrix, the constructor will create a half-matrix
+automatically. If you provide a half-matrix, make sure its keys are
+sorted (low, high): there should be an ('A','C') entry but never a
+('C','A') one.
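+
+A minimal sketch (the counts here are illustrative):
+
+    >>> from Bio.SubsMat import SeqMat
+    >>> counts = {("A", "A"): 30, ("A", "C"): 22, ("C", "C"): 40}
+    >>> arm = SeqMat(counts)  # half matrix over the alphabet "AC"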
+
+Internal functions:
+
+Generating the observed frequency matrix (OFM):
+-----------------------------------------------
+Use: OFM = _build_obs_freq_mat(ARM)
+The OFM is generated from the ARM; instead of replacement counts, it
+contains replacement frequencies.
+
+Generating an expected frequency matrix (EFM):
+----------------------------------------------
+Use: EFM = _build_exp_freq_mat(OFM,exp_freq_table)
+exp_freq_table: should be a FreqTable.FreqTable instance. See FreqTable.py
+for detailed information. Briefly, the expected frequency table holds the
+expected frequency of appearance of each member of the alphabet.
+
+Generating a substitution frequency matrix (SFM):
+-------------------------------------------------
+Use: SFM = _build_subs_mat(OFM,EFM)
+Accepts an OFM and an EFM, and returns the element-wise ratio of their
+corresponding values.
+
+Generating a log-odds matrix (LOM):
+-----------------------------------
+Use: LOM=_build_log_odds_mat(SFM[,logbase=10,factor=10.0,roundit=1])
+Accepts an SFM. logbase: base of the logarithm used to generate the
+log-odds values. factor: factor by which to multiply the log-odds values.
+roundit (default: true): whether to round the values. Each entry is
+computed as log(SFM[key]) * factor, and rounded if requested.
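+For example, with the defaults above an SFM entry of 2.0 becomes
+round(log10(2.0) * 10.0) = 3.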
+
+External:
+---------
+In most cases, users will want to generate a log-odds matrix only, without
+explicitly going through the OFM --> EFM --> SFM stages. The function
+make_log_odds_matrix does that: the user provides an ARM and an expected
+frequency table, and the function returns the log-odds matrix.
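+
+A minimal sketch, reusing ``arm`` from the example above (the expected
+frequencies here are illustrative, and make_log_odds_matrix is the public
+helper defined later in this module):
+
+    >>> from Bio.SubsMat import FreqTable, make_log_odds_matrix
+    >>> ftab = FreqTable.FreqTable({"A": 0.6, "C": 0.4}, FreqTable.FREQ)
+    >>> lom = make_log_odds_matrix(arm, ftab)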
+
+Methods for subtraction, addition and multiplication of matrices:
+-----------------------------------------------------------------
+
+* Generation of an expected frequency table from an observed frequency
+  matrix.
+* Calculation of linear correlation coefficient between two matrices.
+* Calculation of relative entropy is now done using the
+  _make_relative_entropy method and is stored in the member
+  self.relative_entropy
+* Calculation of entropy is now done using the _make_entropy method and
+  is stored in the member self.entropy.
+* Jensen-Shannon distance between the distributions from which the
+  matrices are derived. This is a distance function based on the
+  distribution's entropies.
+
+"""
+
+import re
+import sys
+import copy
+import math
+
+# BioPython imports
+from Bio.SubsMat import FreqTable
+from Bio import BiopythonDeprecationWarning
+
+import warnings
+
+warnings.warn(
+    "Bio.SubsMat has been deprecated, and we intend to remove it in a future "
+    "release of Biopython. As an alternative, please consider using "
+    "Bio.Align.substitution_matrices as a replacement, and contact the "
+    "Biopython developers if you still need the Bio.SubsMat module.",
+    BiopythonDeprecationWarning,
+)
+
+
+log = math.log
+# Matrix types
+NOTYPE = 0
+ACCREP = 1
+OBSFREQ = 2
+SUBS = 3
+EXPFREQ = 4
+LO = 5
+EPSILON = 0.00000000000001
+
+
+class SeqMat(dict):
+    """A Generic sequence matrix class.
+
+    The key is a 2-tuple containing the letter indices of the matrix. Those
+    should be sorted in the tuple (low, high). Because each matrix is dealt
+    with as a half-matrix.
+    """
+
+    def _alphabet_from_matrix(self):
+        """Set alphabet letters from the matrix entries (PRIVATE)."""
+        ab_set = set()
+        for i in self:
+            ab_set.add(i[0])
+            ab_set.add(i[1])
+        self.alphabet = "".join(sorted(ab_set))
+
+    def __init__(self, data=None, alphabet=None, mat_name="", build_later=0):
+        """Initialize.
+
+        User may supply:
+
+        - data: matrix itself
+        - mat_name: its name. See below.
+        - alphabet: an iterable over the letters allowed as indices into the
+          matrix. If not supplied, constructor builds its own from that matrix.
+        - build_later: skip the matrix size assertion. User will build the
+          matrix after creating the instance. Constructor builds a half matrix
+          filled with zeroes.
+
+        """
+        assert isinstance(mat_name, str)
+        # "data" may be:
+        # 1) None --> then self.data is an empty dictionary
+        # 2) type({}) --> then self takes the items in data
+        # 3) An instance of SeqMat
+        # This whole creation-during-execution is done to avoid mutable
+        # default values, which Python creates once when the function is
+        # defined, not each time it is called.
+        if data:
+            try:
+                self.update(data)
+            except ValueError:
+                raise ValueError("Failed to store data in a dictionary")
+
+        # If passed alphabet is empty, use the letters in the matrix itself
+        if alphabet is None:
+            self._alphabet_from_matrix()
+        else:
+            self.alphabet = "".join(alphabet)
+        # Assert matrix size: half or full
+        if not build_later:
+            N = len(self.alphabet)
+            assert len(self) == N ** 2 or len(self) == N * (N + 1) / 2
+        self.ab_list = list(self.alphabet)
+        self.ab_list.sort()
+        # Names: a string like "BLOSUM62" or "PAM250"
+        self.mat_name = mat_name
+        if build_later:
+            self._init_zero()
+        else:
+            # Convert full to half
+            self._full_to_half()
+            self._correct_matrix()
+        self.sum_letters = {}
+        self.relative_entropy = 0
+
+    def _correct_matrix(self):
+        """Sort key tuples (PRIVATE)."""
+        for key in list(self):  # iterate over a copy
+            if key[0] > key[1]:
+                self[(key[1], key[0])] = self[key]
+                del self[key]
+
+    def _full_to_half(self):
+        """Convert a full-matrix to a half-matrix (PRIVATE)."""
+        # For instance: two entries ('A','C'):13 and ('C','A'):20 will be summed
+        # into ('A','C'): 33 and the index ('C','A') will be deleted
+        # ('A','A') and ('C','C') will remain the same.
+
+        N = len(self.alphabet)
+        # Do nothing if this is already a half-matrix
+        if len(self) == N * (N + 1) / 2:
+            return
+        for i in self.ab_list:
+            for j in self.ab_list[: self.ab_list.index(i) + 1]:
+                if i != j:
+                    self[j, i] = self[j, i] + self[i, j]
+                    del self[i, j]
+
+    def _init_zero(self):
+        """Initialize the ab_list values to zero (PRIVATE)."""
+        for i in self.ab_list:
+            for j in self.ab_list[: self.ab_list.index(i) + 1]:
+                self[j, i] = 0.0
+
+    def make_entropy(self):
+        """Calculate and set the entropy attribute."""
+        self.entropy = 0
+        for i in self:
+            if self[i] > EPSILON:
+                self.entropy += self[i] * log(self[i]) / log(2)
+        self.entropy = -self.entropy
+
+    def sum(self):
+        """Return a dict mapping each letter to its summed frequency.
+
+        Diagonal entries count fully towards their letter; off-diagonal
+        entries are split evenly between their two letters.
+        """
+        result = {}
+        for letter in self.alphabet:
+            result[letter] = 0.0
+        for pair, value in self.items():
+            i1, i2 = pair
+            if i1 == i2:
+                result[i1] += value
+            else:
+                result[i1] += value / 2
+                result[i2] += value / 2
+        return result
+
+    def format(
+        self, fmt="%4d", letterfmt="%4s", alphabet=None, non_sym=None, full=False
+    ):
+        """Create a string with the bottom-half (default) or a full matrix.
+
+        User may pass own alphabet, which should contain all letters in the
+        alphabet of the matrix, but may be in a different order. This
+        order will be the order of the letters on the axes.
+        """
+        if not alphabet:
+            alphabet = self.ab_list
+        lines = []
+        assert non_sym is None or isinstance(non_sym, float) or isinstance(non_sym, int)
+        letterline = ""
+        for i in alphabet:
+            letterline += letterfmt % i
+        if full:
+            lines.append(letterline)
+        for i in alphabet:
+            line = i
+            flag = False
+            for j in alphabet:
+                if flag:
+                    val = non_sym
+                else:
+                    try:
+                        val = self[i, j]
+                    except KeyError:
+                        val = self[j, i]
+                if val <= -999:
+                    cur_str = "  ND"
+                else:
+                    cur_str = fmt % val
+                line += cur_str
+                if j == i:
+                    if not full:
+                        break
+                    if non_sym is not None:
+                        flag = True
+            lines.append(line)
+        if not full:
+            lines.append(letterline)
+        return "\n".join(lines)
+
+    def __str__(self):
+        """Print a nice half-matrix."""
+        return self.format()
+
+    def __sub__(self, other):
+        """Return integer subtraction product of the two matrices."""
+        mat_diff = 0
+        for i in self:
+            mat_diff += self[i] - other[i]
+        return mat_diff
+
+    def __mul__(self, other):
+        """Element-wise matrix multiplication.
+
+        Returns a new matrix created by multiplying each element by other (if
+        other is scalar), or by performing element-wise multiplication of the
+        two matrices (if other is a matrix of the same size).
+        """
+        new_mat = copy.copy(self)
+        try:  # first try and see if other is a matrix
+            for i in self:
+                new_mat[i] *= other[i]
+        except TypeError:  # other is a scalar value
+            for i in self:
+                new_mat[i] *= other
+        return new_mat
+
+    def __rmul__(self, other):
+        """Element-wise matrix multiplication.
+
+        Returns a new matrix created by multiplying each element by other (if
+        other is scalar), or by performing element-wise multiplication of the
+        two matrices (if other is a matrix of the same size).
+        """
+        return self.__mul__(other)
+
+    def __add__(self, other):
+        """Matrix addition."""
+        new_mat = copy.copy(self)
+        for i in self:
+            new_mat[i] += other[i]
+        return new_mat
+
+
+class SubstitutionMatrix(SeqMat):
+    """Substitution matrix."""
+
+    def calculate_relative_entropy(self, obs_freq_mat):
+        """Calculate and return relative entropy w.r.t. observed frequency matrix."""
+        relative_entropy = 0.0
+        for key, value in self.items():
+            if value > EPSILON:
+                relative_entropy += obs_freq_mat[key] * log(value)
+        relative_entropy /= log(2)
+        return relative_entropy
+
+
+class LogOddsMatrix(SeqMat):
+    """Log odds matrix."""
+
+    def calculate_relative_entropy(self, obs_freq_mat):
+        """Calculate and return relative entropy w.r.t. observed frequency matrix."""
+        relative_entropy = 0.0
+        for key, value in self.items():
+            relative_entropy += obs_freq_mat[key] * value / log(2)
+        return relative_entropy
+
+
+def _build_obs_freq_mat(acc_rep_mat):
+    """Build observed frequency matrix (PRIVATE).
+
+    Build the observed frequency matrix from an accepted replacement matrix.
+    The acc_rep_mat matrix should be generated by the user.
+    """
+    # Note: acc_rep_mat should already be a half matrix!
+    total = float(sum(acc_rep_mat.values()))
+    obs_freq_mat = SeqMat(alphabet=acc_rep_mat.alphabet, build_later=1)
+    for i in acc_rep_mat:
+        obs_freq_mat[i] = acc_rep_mat[i] / total
+    return obs_freq_mat
+
+
+def _exp_freq_table_from_obs_freq(obs_freq_mat):
+    """Build expected frequence table from observed frequences (PRIVATE)."""
+    exp_freq_table = {}
+    for i in obs_freq_mat.alphabet:
+        exp_freq_table[i] = 0.0
+    for i in obs_freq_mat:
+        if i[0] == i[1]:
+            exp_freq_table[i[0]] += obs_freq_mat[i]
+        else:
+            exp_freq_table[i[0]] += obs_freq_mat[i] / 2.0
+            exp_freq_table[i[1]] += obs_freq_mat[i] / 2.0
+    return FreqTable.FreqTable(exp_freq_table, FreqTable.FREQ)
+
+
+def _build_exp_freq_mat(exp_freq_table):
+    """Build an expected frequency matrix (PRIVATE).
+
+    exp_freq_table: should be a FreqTable instance
+    """
+    exp_freq_mat = SeqMat(alphabet=exp_freq_table.alphabet, build_later=1)
+    for i in exp_freq_mat:
+        if i[0] == i[1]:
+            exp_freq_mat[i] = exp_freq_table[i[0]] ** 2
+        else:
+            exp_freq_mat[i] = 2.0 * exp_freq_table[i[0]] * exp_freq_table[i[1]]
+    return exp_freq_mat
+
+
+#
+# Build the substitution matrix
+#
+def _build_subs_mat(obs_freq_mat, exp_freq_mat):
+    """Build the substitution matrix (PRIVATE)."""
+    if obs_freq_mat.ab_list != exp_freq_mat.ab_list:
+        raise ValueError("Alphabet mismatch in passed matrices")
+    subs_mat = SubstitutionMatrix(obs_freq_mat)
+    for i in obs_freq_mat:
+        subs_mat[i] = obs_freq_mat[i] / exp_freq_mat[i]
+    return subs_mat
+
+
+#
+# Build a log-odds matrix
+#
+def _build_log_odds_mat(subs_mat, logbase=2, factor=10.0, round_digit=0, keep_nd=0):
+    """Build a log-odds matrix (PRIVATE).
+
+    - logbase: base of the logarithm used to build the matrix (default 2)
+    - factor: a factor by which each matrix entry is multiplied (default 10.0)
+    - round_digit: decimal place to round to (default 0)
+    - keep_nd: if true, keeps the -999 marker for non-determined values (for
+      which there are no substitutions in the substitution frequency matrix).
+      If false, replaces entries containing -999 with the minimum log-odds
+      value of the matrix.
+
+    """
+    lo_mat = LogOddsMatrix(subs_mat)
+    for key, value in subs_mat.items():
+        if value < EPSILON:
+            lo_mat[key] = -999
+        else:
+            lo_mat[key] = round(factor * log(value) / log(logbase), round_digit)
+    mat_min = min(lo_mat.values())
+    if not keep_nd:
+        for i in lo_mat:
+            if lo_mat[i] <= -999:
+                lo_mat[i] = mat_min
+    return lo_mat
+
+
+#
+# External function. User provides an accepted replacement matrix, and,
+# optionally the following: expected frequency table, log base, mult. factor,
+# and rounding factor. Generates a log-odds matrix, calling internal SubsMat
+# functions.
+#
+def make_log_odds_matrix(
+    acc_rep_mat, exp_freq_table=None, logbase=2, factor=1.0, round_digit=9, keep_nd=0
+):
+    """Make log-odds matrix."""
+    obs_freq_mat = _build_obs_freq_mat(acc_rep_mat)
+    if not exp_freq_table:
+        exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
+    exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
+    subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
+    lo_mat = _build_log_odds_mat(subs_mat, logbase, factor, round_digit, keep_nd)
+    return lo_mat
+
+
+def observed_frequency_to_substitution_matrix(obs_freq_mat):
+    """Convert observed frequency table into substitution matrix."""
+    exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
+    exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
+    subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
+    return subs_mat
+
+
+def read_text_matrix(data_file):
+    """Read a matrix from a text file."""
+    matrix = {}
+    tmp = data_file.read().split("\n")
+    table = []
+    for i in tmp:
+        table.append(i.split())
+    # remove records beginning with ``#''
+    for rec in table[:]:
+        if rec.count("#") > 0:
+            table.remove(rec)
+
+    # remove null lists
+    while table.count([]) > 0:
+        table.remove([])
+    # build a dictionary
+    alphabet = table[0]
+    j = 0
+    for rec in table[1:]:
+        row = alphabet[j]
+        if re.compile(r"[A-Za-z\*]").match(rec[0]):
+            first_col = 1
+        else:
+            first_col = 0
+        i = 0
+        for field in rec[first_col:]:
+            col = alphabet[i]
+            matrix[(row, col)] = float(field)
+            i += 1
+        j += 1
+    # delete entries with an asterisk (iterate over a copy of the keys)
+    for i in list(matrix):
+        if "*" in i:
+            del matrix[i]
+    ret_mat = SeqMat(matrix)
+    return ret_mat
+
+
+diagNO = 1
+diagONLY = 2
+diagALL = 3
+
+
+def two_mat_relative_entropy(mat_1, mat_2, logbase=2, diag=diagALL):
+    """Return relative entropy of two matrices."""
+    rel_ent = 0.0
+    key_list_1 = sorted(mat_1)
+    key_list_2 = sorted(mat_2)
+    key_list = []
+    sum_ent_1 = 0.0
+    sum_ent_2 = 0.0
+    for i in key_list_1:
+        if i in key_list_2:
+            key_list.append(i)
+    if len(key_list_1) != len(key_list_2):
+        sys.stderr.write("Warning: the matrices have different numbers of entries\n")
+    if key_list_1 != key_list_2:
+        sys.stderr.write("Warning: indices differ between the two matrices\n")
+    for key in key_list:
+        if diag == diagNO and key[0] == key[1]:
+            continue
+        if diag == diagONLY and key[0] != key[1]:
+            continue
+        if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
+            sum_ent_1 += mat_1[key]
+            sum_ent_2 += mat_2[key]
+
+    for key in key_list:
+        if diag == diagNO and key[0] == key[1]:
+            continue
+        if diag == diagONLY and key[0] != key[1]:
+            continue
+        if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
+            val_1 = mat_1[key] / sum_ent_1
+            val_2 = mat_2[key] / sum_ent_2
+            rel_ent += val_1 * log(val_1 / val_2) / log(logbase)
+    return rel_ent
+
+
+def two_mat_correlation(mat_1, mat_2):
+    """Return linear correlation coefficient between two matrices."""
+    try:
+        import numpy
+    except ImportError:
+        raise ImportError(
+            "Please install Numerical Python (numpy) if you want to use this function"
+        )
+    values = []
+    assert mat_1.ab_list == mat_2.ab_list
+    for ab_pair in mat_1:
+        try:
+            values.append((mat_1[ab_pair], mat_2[ab_pair]))
+        except KeyError:
+            raise ValueError("%s is not a common key" % ab_pair)
+    correlation_matrix = numpy.corrcoef(values, rowvar=0)
+    correlation = correlation_matrix[0, 1]
+    return correlation
+
+
+def two_mat_DJS(mat_1, mat_2, pi_1=0.5, pi_2=0.5):
+    """Return Jensen-Shannon Distance between two observed frequence matrices."""
+    assert mat_1.ab_list == mat_2.ab_list
+    assert pi_1 > 0 and pi_2 > 0 and pi_1 < 1 and pi_2 < 1
+    assert abs(pi_1 + pi_2 - 1.0) < EPSILON
+    sum_mat = SeqMat(build_later=1)
+    sum_mat.ab_list = mat_1.ab_list
+    for i in mat_1:
+        sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
+    sum_mat.make_entropy()
+    mat_1.make_entropy()
+    mat_2.make_entropy()
+    dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 * mat_2.entropy
+    return dJS
diff --git a/code/lib/Bio/SubsMat/__pycache__/FreqTable.cpython-37.pyc b/code/lib/Bio/SubsMat/__pycache__/FreqTable.cpython-37.pyc
new file mode 100644
index 0000000..3452a68
Binary files /dev/null and b/code/lib/Bio/SubsMat/__pycache__/FreqTable.cpython-37.pyc differ
diff --git a/code/lib/Bio/SubsMat/__pycache__/MatrixInfo.cpython-37.pyc b/code/lib/Bio/SubsMat/__pycache__/MatrixInfo.cpython-37.pyc
new file mode 100644
index 0000000..0571cff
Binary files /dev/null and b/code/lib/Bio/SubsMat/__pycache__/MatrixInfo.cpython-37.pyc differ
diff --git a/code/lib/Bio/SubsMat/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SubsMat/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..a992921
Binary files /dev/null and b/code/lib/Bio/SubsMat/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/SwissProt/KeyWList.py b/code/lib/Bio/SwissProt/KeyWList.py
new file mode 100644
index 0000000..fa27d9a
--- /dev/null
+++ b/code/lib/Bio/SwissProt/KeyWList.py
@@ -0,0 +1,91 @@
+# Copyright 1999 by Jeffrey Chang.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to parse the keywlist.txt file from SwissProt/UniProt.
+
+See:
+ - https://www.uniprot.org/docs/keywlist
+ - https://www.uniprot.org/docs/keywlist.txt
+
+Classes:
+ - Record            Stores the information about one keyword or one category
+   in the keywlist.txt file.
+
+Functions:
+ - parse             Parses the keywlist.txt file and returns an iterator to
+   the records it contains.
+
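+A minimal usage sketch (assuming a local copy of keywlist.txt downloaded
+from the URL above)::
+
+    from Bio.SwissProt import KeyWList
+
+    with open("keywlist.txt") as handle:
+        for record in KeyWList.parse(handle):
+            # Keyword entries carry "ID"; category entries carry "IC".
+            print(record.get("ID") or record.get("IC"), record["AC"])
+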
+"""
+
+
+class Record(dict):
+    """Store information of one keyword or category from the keywords list.
+
+    This record stores the information of one keyword or category in the
+    keywlist.txt as a Python dictionary. The keys in this dictionary are
+    the line codes that can appear in the keywlist.txt file::
+
+        ---------  ---------------------------     ----------------------
+        Line code  Content                         Occurrence in an entry
+        ---------  ---------------------------     ----------------------
+        ID         Identifier (keyword)            Once; starts a keyword entry
+        IC         Identifier (category)           Once; starts a category entry
+        AC         Accession (KW-xxxx)             Once
+        DE         Definition                      Once or more
+        SY         Synonyms                        Optional; once or more
+        GO         Gene ontology (GO) mapping      Optional; once or more
+        HI         Hierarchy                       Optional; once or more
+        WW         Relevant WWW site               Optional; once or more
+        CA         Category                        Once per keyword entry; absent
+                                                   in category entries
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        dict.__init__(self)
+        for keyword in ("DE", "SY", "GO", "HI", "WW"):
+            self[keyword] = []
+
+
+def parse(handle):
+    """Parse the keyword list from file handle.
+
+    Returns a generator object which yields keyword entries as
+    Bio.SwissProt.KeyWList.Record() objects.
+    """
+    record = Record()
+    # First, skip the header - look for start of a record
+    for line in handle:
+        if line.startswith("ID   "):
+            # Found the start of the first (keyword) record
+            record["ID"] = line[5:].strip()
+            break
+        if line.startswith("IC   "):
+            # Found the start of the first (category) record
+            record["IC"] = line[5:].strip()
+            break
+    # Now parse the records
+    for line in handle:
+        if line.startswith("-------------------------------------"):
+            # We have reached the footer
+            break
+        key = line[:2]
+        if key == "//":
+            record["DE"] = " ".join(record["DE"])
+            record["SY"] = " ".join(record["SY"])
+            yield record
+            record = Record()
+        elif line[2:5] == "   ":
+            value = line[5:].strip()
+            if key in ("ID", "IC", "AC", "CA"):
+                record[key] = value
+            elif key in ("DE", "SY", "GO", "HI", "WW"):
+                record[key].append(value)
+            else:
+                print("Ignoring: %s" % line.strip())
+    # Read the footer and throw it away
+    for line in handle:
+        pass
diff --git a/code/lib/Bio/SwissProt/__init__.py b/code/lib/Bio/SwissProt/__init__.py
new file mode 100644
index 0000000..a311ce0
--- /dev/null
+++ b/code/lib/Bio/SwissProt/__init__.py
@@ -0,0 +1,861 @@
+# Copyright 2007 by Michiel de Hoon.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Code to work with the sprotXX.dat file from SwissProt.
+
+https://web.expasy.org/docs/userman.html
+
+Classes:
+ - Record             Holds SwissProt data.
+ - Reference          Holds reference data from a SwissProt record.
+
+Functions:
+ - read               Read one SwissProt record
+ - parse              Read multiple SwissProt records
+
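+A minimal sketch for reading a single record (the file name is
+hypothetical)::
+
+    from Bio import SwissProt
+
+    record = SwissProt.read("P0A7V8.txt")
+    print(record.entry_name, record.sequence_length)
+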
+"""
+
+
+import io
+
+from Bio.SeqFeature import (
+    SeqFeature,
+    FeatureLocation,
+    ExactPosition,
+    BeforePosition,
+    AfterPosition,
+    UncertainPosition,
+    UnknownPosition,
+)
+
+
+class SwissProtParserError(ValueError):
+    """An error occurred while parsing a SwissProt file."""
+
+    def __init__(self, *args, line=None):
+        """Create a SwissProtParserError object with the offending line."""
+        super().__init__(*args)
+        self.line = line
+
+
+class Record:
+    """Holds information from a SwissProt record.
+
+    Attributes:
+     - entry_name        Name of this entry, e.g. RL1_ECOLI.
+     - data_class        Either 'STANDARD' or 'PRELIMINARY'.
+     - molecule_type     Type of molecule, 'PRT' (None in newer files).
+     - sequence_length   Number of residues.
+     - accessions        List of the accession numbers, e.g. ['P00321']
+     - created           A tuple of (date, release).
+     - sequence_update   A tuple of (date, release).
+     - annotation_update A tuple of (date, release).
+     - description       Free-format description.
+     - gene_name         Gene name.  See userman.txt for description.
+     - organism          The source of the sequence.
+     - organelle         The origin of the sequence.
+     - organism_classification  The taxonomy classification.  List of strings.
+       (http://www.ncbi.nlm.nih.gov/Taxonomy/)
+     - taxonomy_id       A list of NCBI taxonomy id's.
+     - host_organism     A list of names of the hosts of a virus, if any.
+     - host_taxonomy_id  A list of NCBI taxonomy id's of the hosts, if any.
+     - references        List of Reference objects.
+     - comments          List of strings.
+     - cross_references  List of tuples (db, id1[, id2][, id3]).  See the docs.
+     - keywords          List of the keywords.
+     - features          List of tuples (key name, from, to, description).
+       from and to can be either integers for the residue
+       numbers, '<', '>', or '?'
+     - protein_existence Numerical value describing the evidence for the existence of the protein.
+     - seqinfo           tuple of (length, molecular weight, CRC32 value)
+     - sequence          The sequence.
+
+    Examples
+    --------
+    >>> from Bio import SwissProt
+    >>> example_filename = "SwissProt/sp008"
+    >>> with open(example_filename) as handle:
+    ...     records = SwissProt.parse(handle)
+    ...     for record in records:
+    ...         print(record.entry_name)
+    ...         print(",".join(record.accessions))
+    ...         print(record.keywords)
+    ...         print(repr(record.organism))
+    ...         print(record.sequence[:20] + "...")
+    ...
+    1A02_HUMAN
+    P01892,P06338,P30514,P30444,P30445,P30446,Q29680,Q29899,Q95352,Q29837,Q95380
+    ['MHC I', 'Transmembrane', 'Glycoprotein', 'Signal', 'Polymorphism', '3D-structure']
+    'Homo sapiens (Human).'
+    MAVMAPRTLVLLLSGALALT...
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.entry_name = None
+        self.data_class = None
+        self.molecule_type = None
+        self.sequence_length = None
+
+        self.accessions = []
+        self.created = None
+        self.sequence_update = None
+        self.annotation_update = None
+
+        self.description = []
+        self.gene_name = ""
+        self.organism = []
+        self.organelle = ""
+        self.organism_classification = []
+        self.taxonomy_id = []
+        self.host_organism = []
+        self.host_taxonomy_id = []
+        self.references = []
+        self.comments = []
+        self.cross_references = []
+        self.keywords = []
+        self.features = []
+        self.protein_existence = ""
+
+        self.seqinfo = None
+        self.sequence = ""
+
+
+class Reference:
+    """Holds information from one reference in a SwissProt entry.
+
+    Attributes:
+     - number      Number of reference in an entry.
+     - evidence    Evidence code.  List of strings.
+     - positions   Describes extent of work.  List of strings.
+     - comments    Comments.  List of (token, text).
+     - references  References.  List of (dbname, identifier).
+     - authors     The authors of the work.
+     - title       Title of the work.
+     - location    A citation for the work.
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.number = None
+        self.positions = []
+        self.comments = []
+        self.references = []
+        self.authors = []
+        self.title = []
+        self.location = []
+
+
+class FeatureTable(SeqFeature):
+    """Stores feature annotations for specific regions of the sequence.
+
+    This is a subclass of SeqFeature, defined in Bio.SeqFeature, where the
+    attributes are used as follows:
+
+     - ``location``: location of the feature on the canonical or isoform
+       sequence; the location is stored as an instance of FeatureLocation,
+       defined in Bio.SeqFeature, with the ref attribute set to the isoform
+       ID referring to the canonical or isoform sequence on which the feature
+       is defined
+     - ``id``: unique and stable identifier (FTId), only provided for features
+       belonging to the types CARBOHYD, CHAIN, PEPTIDE, PROPEP, VARIANT, or
+       VAR_SEQ
+     - ``type``: indicates the type of feature, as defined by the UniProt
+       Knowledgebase documentation:
+
+        - ACT_SITE: amino acid(s) involved in the activity of an enzyme
+        - BINDING:  binding site for any chemical group
+        - CARBOHYD: glycosylation site; an FTId identifier to the GlyConnect
+          database is provided if annotated there
+        - CA_BIND:  calcium-binding region
+        - CHAIN:    polypeptide chain in the mature protein
+        - COILED:   coiled-coil region
+        - COMPBIAS: compositionally biased region
+        - CONFLICT: different sources report differing sequences
+        - CROSSLNK: posttranslationally formed amino acid bond
+        - DISULFID: disulfide bond
+        - DNA_BIND: DNA-binding region
+        - DOMAIN:   domain, defined as a specific combination of secondary
+          structures organized into a characteristic three-dimensional
+          structure or fold
+        - INIT_MET: initiator methionine
+        - INTRAMEM: region located in a membrane without crossing it
+        - HELIX:    alpha-, 3(10)-, or pi-helix secondary structure
+        - LIPID:    covalent binding of a lipid moiety
+        - METAL:    binding site for a metal ion
+        - MOD_RES:  posttranslational modification (PTM) of a residue,
+          annotated by the controlled vocabulary defined by the ptmlist.txt
+          document on the UniProt website
+        - MOTIF:    short sequence motif of biological interest
+        - MUTAGEN:  site experimentally altered by mutagenesis
+        - NON_CONS: non-consecutive residues
+        - NON_STD:  non-standard amino acid
+        - NON_TER:  the residue at an extremity of the sequence is not the
+          terminal residue
+        - NP_BIND:  nucleotide phosphate-binding region
+        - PEPTIDE:  released active mature polypeptide
+        - PROPEP:   any processed propeptide
+        - REGION:   region of interest in the sequence
+        - REPEAT:   internal sequence repetition
+        - SIGNAL:   signal sequence (prepeptide)
+        - SITE:     amino-acid site of interest not represented by another
+          feature key
+        - STRAND:   beta-strand secondary structure; either a hydrogen-bonded
+          extended beta strand or a residue in an isolated beta-bridge
+        - TOPO_DOM: topological domain
+        - TRANSIT:  transit peptide (mitochondrion, chloroplast, thylakoid,
+          cyanelle, peroxisome, etc.)
+        - TRANSMEM: transmembrane region
+        - TURN:     H-bonded turn (3-, 4-, or 5-turn)
+        - UNSURE:   uncertainties in the sequence
+        - VARIANT:  sequence variant; an FTId is provided for protein sequence
+          variants of Hominidae (great apes and humans)
+        - VAR_SEQ:  sequence variant produced by alternative splicing,
+          alternative promoter usage, alternative initiation, or ribosomal
+          frameshifting
+        - ZN_FING:  zinc finger region
+
+     - qualifiers   A dictionary of additional information, which may include
+       the feature evidence and free-text notes. While SwissProt includes the
+       feature identifier code (FTId) as a qualifier, it is stored as the
+       attribute ID of the FeatureTable object.
+
+    """
+
+
+def parse(source):
+    """Read multiple SwissProt records from file.
+
+    Argument source is a file-like object or a path to a file.
+
+    Returns a generator object which yields Bio.SwissProt.Record() objects.
+    """
+    handle = _open(source)
+    try:
+        while True:
+            record = _read(handle)
+            if not record:
+                return
+            yield record
+    finally:
+        if handle is not source:
+            handle.close()
+
+
+def read(source):
+    """Read one SwissProt record from file.
+
+    Argument source is a file-like object or a path to a file.
+
+    Returns a Record() object.
+    """
+    handle = _open(source)
+    try:
+        record = _read(handle)
+        if not record:
+            raise ValueError("No SwissProt record found")
+        # We should have reached the end of the record by now.
+        # Try to read one more line to be sure:
+        try:
+            next(handle)
+        except StopIteration:
+            return record
+        raise ValueError("More than one SwissProt record found")
+    finally:
+        if handle is not source:
+            handle.close()
+
+
+# Everything below is considered private
+
+
+def _open(source):
+    try:
+        handle = open(source)
+        return handle
+    except TypeError:
+        handle = source
+        if handle.read(0) == "":
+            # handle is text; assume the encoding is compatible with ASCII
+            return handle
+        # handle is binary; SwissProt encoding is always ASCII
+        return io.TextIOWrapper(handle, encoding="ASCII")
+
+
+def _read(handle):
+    record = None
+    unread = ""
+    try:
+        line = next(handle)
+    except StopIteration:
+        return record
+    key, value = line[:2], line[5:].rstrip()
+    if key != "ID":
+        raise SwissProtParserError("Failed to find ID in first line", line=line)
+    record = Record()
+    _read_id(record, line)
+    _sequence_lines = []
+    for line in handle:
+        key, value = line[:2], line[5:].rstrip()
+        if unread:
+            value = unread + " " + value
+            unread = ""
+        if key == "AC":
+            accessions = value.rstrip(";").split("; ")
+            record.accessions.extend(accessions)
+        elif key == "DT":
+            _read_dt(record, line)
+        elif key == "DE":
+            record.description.append(value.strip())
+        elif key == "GN":
+            if record.gene_name:
+                record.gene_name += " "
+            record.gene_name += value
+        elif key == "OS":
+            record.organism.append(value)
+        elif key == "OG":
+            record.organelle += line[5:]
+        elif key == "OC":
+            cols = value.rstrip(";.").split("; ")
+            record.organism_classification.extend(cols)
+        elif key == "OX":
+            _read_ox(record, line)
+        elif key == "OH":
+            _read_oh(record, line)
+        elif key == "RN":
+            reference = Reference()
+            _read_rn(reference, value)
+            record.references.append(reference)
+        elif key == "RP":
+            assert record.references, "RP: missing RN"
+            record.references[-1].positions.append(value)
+        elif key == "RC":
+            assert record.references, "RC: missing RN"
+            reference = record.references[-1]
+            unread = _read_rc(reference, value)
+        elif key == "RX":
+            assert record.references, "RX: missing RN"
+            reference = record.references[-1]
+            _read_rx(reference, value)
+        elif key == "RL":
+            assert record.references, "RL: missing RN"
+            reference = record.references[-1]
+            reference.location.append(value)
+        # In UniProt release 1.12 of 6/21/04, there is a new RG
+        # (Reference Group) line, which references a group instead of
+        # an author.  Each block must have at least 1 RA or RG line.
+        elif key == "RA":
+            assert record.references, "RA: missing RN"
+            reference = record.references[-1]
+            reference.authors.append(value)
+        elif key == "RG":
+            assert record.references, "RG: missing RN"
+            reference = record.references[-1]
+            reference.authors.append(value)
+        elif key == "RT":
+            assert record.references, "RT: missing RN"
+            reference = record.references[-1]
+            reference.title.append(value)
+        elif key == "CC":
+            _read_cc(record, line)
+        elif key == "DR":
+            _read_dr(record, value)
+        elif key == "PE":
+            _read_pe(record, value)
+        elif key == "KW":
+            _read_kw(record, value)
+        elif key == "FT":
+            _read_ft(record, line)
+        elif key == "SQ":
+            cols = value.split()
+            assert len(cols) == 7, "I don't understand SQ line %s" % line
+            # Do more checking here?
+            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
+        elif key == "  ":
+            _sequence_lines.append(value.replace(" ", "").rstrip())
+        elif key == "//":
+            # Join multiline data into one string
+            record.description = " ".join(record.description)
+            record.organism = " ".join(record.organism)
+            record.organelle = record.organelle.rstrip()
+            for reference in record.references:
+                reference.authors = " ".join(reference.authors).rstrip(";")
+                if reference.title:
+                    title = reference.title[0]
+                    for fragment in reference.title[1:]:
+                        if not title.endswith("-"):
+                            title += " "
+                        title += fragment
+                    title = title.rstrip(";")
+                    if title.startswith('"') and title.endswith('"'):
+                        title = title[1:-1]  # remove quotes
+                else:
+                    title = ""
+                reference.title = title
+                reference.location = " ".join(reference.location)
+            record.sequence = "".join(_sequence_lines)
+            return record
+        elif key == "**":
+            # Do this one last, as it will almost never occur.
+            # See Bug 2353, some files from the EBI have extra lines
+            # starting "**" (two asterisks/stars).  They appear
+            # to be unofficial automated annotations. e.g.
+            # **
+            # **   #################    INTERNAL SECTION    ##################
+            # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
+            pass
+        else:
+            raise SwissProtParserError("Unknown keyword '%s' found" % key, line=line)
+    if record:
+        raise ValueError("Unexpected end of stream.")
+
+
+def _read_id(record, line):
+    cols = line[5:].split()
+    # Prior to release 51, included with MoleculeType:
+    # ID   EntryName DataClass; MoleculeType; SequenceLength AA.
+    #
+    # Newer files lack the MoleculeType:
+    # ID   EntryName DataClass; SequenceLength AA.
+    if len(cols) == 5:
+        record.entry_name = cols[0]
+        record.data_class = cols[1].rstrip(";")
+        record.molecule_type = cols[2].rstrip(";")
+        record.sequence_length = int(cols[3])
+    elif len(cols) == 4:
+        record.entry_name = cols[0]
+        record.data_class = cols[1].rstrip(";")
+        record.molecule_type = None
+        record.sequence_length = int(cols[2])
+    else:
+        raise SwissProtParserError("ID line has unrecognised format", line=line)
+    # check if the data class is one of the allowed values
+    allowed = ("STANDARD", "PRELIMINARY", "IPI", "Reviewed", "Unreviewed")
+    if record.data_class not in allowed:
+        message = "Unrecognized data class '%s'" % record.data_class
+        raise SwissProtParserError(message, line=line)
+
+    # molecule_type should be 'PRT' for PRoTein
+    # Note that has been removed in recent releases (set to None)
+    if record.molecule_type not in (None, "PRT"):
+        message = "Unrecognized molecule type '%s'" % record.molecule_type
+        raise SwissProtParserError(message, line=line)
+
+
+def _read_dt(record, line):
+    value = line[5:]
+    uprline = value.upper()
+    cols = value.rstrip().split()
+    if (
+        "CREATED" in uprline
+        or "LAST SEQUENCE UPDATE" in uprline
+        or "LAST ANNOTATION UPDATE" in uprline
+    ):
+        # Old style DT line
+        # =================
+        # e.g.
+        # DT   01-FEB-1995 (Rel. 31, Created)
+        # DT   01-FEB-1995 (Rel. 31, Last sequence update)
+        # DT   01-OCT-2000 (Rel. 40, Last annotation update)
+        #
+        # or:
+        # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
+        # ...
+
+        # find where the version information will be located
+        # This is needed for when you have cases like IPI where
+        # the release version is in a different spot:
+        # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
+        uprcols = uprline.split()
+        rel_index = -1
+        for index in range(len(uprcols)):
+            if "REL." in uprcols[index]:
+                rel_index = index
+        assert rel_index >= 0, "Could not find Rel. in DT line: %s" % line
+        version_index = rel_index + 1
+        # get the version information
+        str_version = cols[version_index].rstrip(",")
+        # no version number
+        if str_version == "":
+            version = 0
+        # dot versioned
+        elif "." in str_version:
+            version = str_version
+        # integer versioned
+        else:
+            version = int(str_version)
+        date = cols[0]
+
+        if "CREATED" in uprline:
+            record.created = date, version
+        elif "LAST SEQUENCE UPDATE" in uprline:
+            record.sequence_update = date, version
+        elif "LAST ANNOTATION UPDATE" in uprline:
+            record.annotation_update = date, version
+        else:
+            raise SwissProtParserError("Unrecognised DT (DaTe) line", line=line)
+    elif (
+        "INTEGRATED INTO" in uprline
+        or "SEQUENCE VERSION" in uprline
+        or "ENTRY VERSION" in uprline
+    ):
+        # New style DT line
+        # =================
+        # As of UniProt Knowledgebase release 7.0 (including
+        # Swiss-Prot release 49.0 and TrEMBL release 32.0) the
+        # format of the DT lines and the version information
+        # in them was changed - the release number was dropped.
+        #
+        # For more information see bug 1948 and
+        # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
+        #
+        # e.g.
+        # DT   01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
+        # DT   15-OCT-2001, sequence version 3.
+        # DT   01-APR-2004, entry version 14.
+        #
+        # This is a new style DT line...
+
+        # The date should be in string cols[1]
+        # Get the version number if there is one.
+        # For the three DT lines above: 0, 3, 14
+        try:
+            version = 0
+            for s in cols[-1].split("."):
+                if s.isdigit():
+                    version = int(s)
+        except ValueError:
+            version = 0
+        date = cols[0].rstrip(",")
+
+        # Re-use the historical property names, even though
+        # the meaning has changed slightly:
+        if "INTEGRATED" in uprline:
+            record.created = date, version
+        elif "SEQUENCE VERSION" in uprline:
+            record.sequence_update = date, version
+        elif "ENTRY VERSION" in uprline:
+            record.annotation_update = date, version
+        else:
+            raise SwissProtParserError("Unrecognised DT (DaTe) line", line=line)
+    else:
+        raise SwissProtParserError("Failed to parse DT (DaTe) line", line=line)
+
+
+def _read_ox(record, line):
+    # The OX line used to be in the simple format:
+    # OX   DESCRIPTION=ID[, ID]...;
+    # If there are too many id's to fit onto a line, then the ID's
+    # continue directly onto the next line, e.g.
+    # OX   DESCRIPTION=ID[, ID]...
+    # OX   ID[, ID]...;
+    # Currently, the description is always "NCBI_TaxID".
+    # To parse this, I need to check to see whether I'm at the
+    # first line.  If I am, grab the description and make sure
+    # it's an NCBI ID.  Then, grab all the id's.
+    #
+    # As of the 2014-10-01 release, there may be an evidence code, e.g.
+    # OX   NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1};
+    # In the short term, we will ignore any evidence codes:
+    line = line.split("{")[0]
+    if record.taxonomy_id:
+        ids = line[5:].rstrip().rstrip(";")
+    else:
+        descr, ids = line[5:].rstrip().rstrip(";").split("=")
+        assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
+    record.taxonomy_id.extend(ids.split(", "))
+
+
+def _read_oh(record, line):
+    # Line type OH (Organism Host) for viral hosts
+    assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line
+    line = line[16:].rstrip()
+    assert line[-1] == "." and line.count(";") == 1, line
+    taxid, name = line[:-1].split(";")
+    record.host_taxonomy_id.append(taxid.strip())
+    record.host_organism.append(name.strip())
+
+
+def _read_rn(reference, rn):
+    # This used to be a very simple line with a reference number, e.g.
+    # RN   [1]
+    # As of the 2014-10-01 release, there may be an evidence code, e.g.
+    # RN   [1] {ECO:0000313|EMBL:AEX14553.1}
+    words = rn.split(None, 1)
+    number = words[0]
+    assert number.startswith("[") and number.endswith("]"), (
+        "Missing brackets %s" % number
+    )
+    reference.number = int(number[1:-1])
+    if len(words) > 1:
+        evidence = words[1]
+        assert evidence.startswith("{") and evidence.endswith("}"), (
+            "Missing braces %s" % evidence
+        )
+        reference.evidence = evidence[1:-1].split("|")
+
+
+def _read_rc(reference, value):
+    cols = value.split(";")
+    if value[-1] == ";":
+        unread = ""
+    else:
+        cols, unread = cols[:-1], cols[-1]
+    for col in cols:
+        if not col:  # last column will be the empty string
+            return
+        # The token is everything before the first '=' character.
+        i = col.find("=")
+        if i >= 0:
+            token, text = col[:i], col[i + 1 :]
+            comment = token.lstrip(), text
+            reference.comments.append(comment)
+        else:
+            # Continuation of the previous comment: append to its text part.
+            token, text = reference.comments[-1]
+            reference.comments[-1] = (token, "%s %s" % (text, col))
+    return unread
+
+
+def _read_rx(reference, value):
+    # The basic (older?) RX line is of the form:
+    # RX   MEDLINE; 85132727.
+    # but there are variants of this that need to be dealt with (see below)
+
+    # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
+    # have extraneous information in the RX line.  Check for
+    # this and chop it out of the line.
+    # (noticed by katel@worldpath.net)
+    value = value.replace(" [NCBI, ExPASy, Israel, Japan]", "")
+
+    # RX lines can also be used of the form
+    # RX   PubMed=9603189;
+    # reported by edvard@farmasi.uit.no
+    # and these can be more complicated like:
+    # RX   MEDLINE=95385798; PubMed=7656980;
+    # RX   PubMed=15060122; DOI=10.1136/jmg 2003.012781;
+    # We look for these cases first and deal with them
+    warn = False
+    if "=" in value:
+        cols = value.split("; ")
+        cols = [x.strip() for x in cols]
+        cols = [x for x in cols if x]
+        for col in cols:
+            x = col.split("=")
+            if len(x) != 2 or x == ["DOI", "DOI"]:
+                warn = True
+                break
+            assert len(x) == 2, "I don't understand RX line %s" % value
+            reference.references.append((x[0], x[1].rstrip(";")))
+    # otherwise we assume we have the type 'RX   MEDLINE; 85132727.'
+    else:
+        cols = value.split("; ")
+        # normally we split into two parts
+        if len(cols) != 2:
+            warn = True
+        else:
+            reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
+    if warn:
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn("Possibly corrupt RX line %r" % value, BiopythonParserWarning)
+
+
+def _read_cc(record, line):
+    key, value = line[5:8], line[9:].rstrip()
+    if key == "-!-":  # Make a new comment
+        record.comments.append(value)
+    elif key == "   ":  # add to the previous comment
+        if not record.comments:
+            # TCMO_STRGA in Release 37 has comment with no topic
+            record.comments.append(value)
+        else:
+            record.comments[-1] += " " + value
+
+
+def _read_dr(record, value):
+    cols = value.rstrip(".").split("; ")
+    record.cross_references.append(tuple(cols))
+
+
+def _read_pe(record, value):
+    pe = value.split(":")
+    record.protein_existence = int(pe[0])
+
+
+def _read_kw(record, value):
+    # Old style - semi-colon separated, multi-line. e.g. Q13639.txt
+    # KW   Alternative splicing; Cell membrane; Complete proteome;
+    # KW   Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein;
+    # KW   Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer;
+    # KW   Transmembrane.
+    #
+    # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt
+    # KW   Monooxygenase {ECO:0000313|EMBL:AEX14553.1};
+    # KW   Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}.
+    # For now to match the XML parser, drop the evidence codes.
+    for value in value.rstrip(";.").split("; "):
+        if value.endswith("}"):
+            # Discard the evidence code
+            value = value.rsplit("{", 1)[0]
+        record.keywords.append(value.strip())
+
+
+def _read_ft(record, line):
+    name = line[5:13].rstrip()
+    if name:
+        if line[13:21] == "        ":  # new-style FT line
+            location = line[21:80].rstrip()
+            try:
+                isoform_id, location = location.split(":")
+            except ValueError:
+                isoform_id = None
+            try:
+                from_res, to_res = location.split("..")
+            except ValueError:
+                from_res = location
+                to_res = ""
+            qualifiers = {}
+        else:  # old-style FT line
+            from_res = line[14:20].lstrip()
+            to_res = line[21:27].lstrip()
+            isoform_id = None
+            description = line[34:75].rstrip()
+            qualifiers = {"description": description}
+        if from_res == "?":
+            from_res = UnknownPosition()
+        elif from_res.startswith("?"):
+            position = int(from_res[1:]) - 1  # Python zero-based counting
+            from_res = UncertainPosition(position)
+        elif from_res.startswith("<"):
+            position = int(from_res[1:]) - 1  # Python zero-based counting
+            from_res = BeforePosition(position)
+        else:
+            position = int(from_res) - 1  # Python zero-based counting
+            from_res = ExactPosition(position)
+        if to_res == "":
+            position = from_res + 1
+            to_res = ExactPosition(position)
+        elif to_res == "?":
+            to_res = UnknownPosition()
+        elif to_res.startswith("?"):
+            position = int(to_res[1:])
+            to_res = UncertainPosition(position)
+        elif to_res.startswith(">"):
+            position = int(to_res[1:])
+            to_res = AfterPosition(position)
+        else:
+            position = int(to_res)
+            to_res = ExactPosition(position)
+        location = FeatureLocation(from_res, to_res, ref=isoform_id)
+        feature = FeatureTable(
+            location=location, type=name, id=None, qualifiers=qualifiers
+        )
+        record.features.append(feature)
+        return
+    # this line is a continuation of the previous feature
+    feature = record.features[-1]
+    if line[5:34] == "                             ":  # old-style FT line
+        description = line[34:75].rstrip()
+        if description.startswith("/FTId="):
+            # store the FTId as the feature ID
+            feature.id = description[6:].rstrip(".")
+            return
+        # this line is a continuation of the description of the previous feature
+        old_description = feature.qualifiers["description"]
+        if old_description.endswith("-"):
+            description = "%s%s" % (old_description, description)
+        else:
+            description = "%s %s" % (old_description, description)
+
+        if feature.type in ("VARSPLIC", "VAR_SEQ"):  # special case
+            # Remove unwanted spaces in sequences.
+            # During line carryover, the sequences in VARSPLIC/VAR_SEQ can get
+            # mangled with unwanted spaces like:
+            # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
+            # We want to check for this case and correct it as it happens.
+            try:
+                first_seq, second_seq = description.split(" -> ")
+            except ValueError:
+                pass
+            else:
+                extra_info = ""
+                # we might have more information at the end of the
+                # second sequence, which should be in parenthesis
+                extra_info_pos = second_seq.find(" (")
+                if extra_info_pos != -1:
+                    extra_info = second_seq[extra_info_pos:]
+                    second_seq = second_seq[:extra_info_pos]
+                # now clean spaces out of the first and second string
+                first_seq = first_seq.replace(" ", "")
+                second_seq = second_seq.replace(" ", "")
+                # reassemble the description
+                description = first_seq + " -> " + second_seq + extra_info
+        feature.qualifiers["description"] = description
+    else:  # new-style FT line
+        value = line[21:].rstrip()
+        if value.startswith("/id="):
+            value = value[4:]
+            assert value.startswith('"')
+            assert value.endswith('"')
+            feature.id = value[1:-1]
+            return
+        elif value.startswith("/evidence="):
+            value = value[10:]
+            assert value.startswith('"')
+            if value.endswith('"'):
+                value = value[1:-1]
+            else:  # continues on the next line
+                value = value[1:]
+            assert "evidence" not in feature.qualifiers
+            feature.qualifiers["evidence"] = value
+            return
+        elif value.startswith("/note="):
+            value = value[6:]
+            assert value.startswith('"')
+            if value.endswith('"'):
+                value = value[1:-1]
+            else:  # continues on the next line
+                value = value[1:]
+            assert "note" not in feature.qualifiers
+            feature.qualifiers["note"] = value
+            return
+        # this line is a continuation of the description of the previous feature
+        keys = list(feature.qualifiers.keys())
+        key = keys[-1]
+        description = value.rstrip('"')
+        old_description = feature.qualifiers[key]
+        if key == "evidence" or old_description.endswith("-"):
+            description = "%s%s" % (old_description, description)
+        else:
+            description = "%s %s" % (old_description, description)
+        if feature.type == "VAR_SEQ":  # see VARSPLIC above
+            try:
+                first_seq, second_seq = description.split(" -> ")
+            except ValueError:
+                pass
+            else:
+                extra_info = ""
+                # we might have more information at the end of the
+                # second sequence, which should be in parenthesis
+                extra_info_pos = second_seq.find(" (")
+                if extra_info_pos != -1:
+                    extra_info = second_seq[extra_info_pos:]
+                    second_seq = second_seq[:extra_info_pos]
+                # now clean spaces out of the first and second string
+                first_seq = first_seq.replace(" ", "")
+                second_seq = second_seq.replace(" ", "")
+                # reassemble the description
+                description = first_seq + " -> " + second_seq + extra_info
+        feature.qualifiers[key] = description
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/SwissProt/__pycache__/KeyWList.cpython-37.pyc b/code/lib/Bio/SwissProt/__pycache__/KeyWList.cpython-37.pyc
new file mode 100644
index 0000000..431122b
Binary files /dev/null and b/code/lib/Bio/SwissProt/__pycache__/KeyWList.cpython-37.pyc differ
diff --git a/code/lib/Bio/SwissProt/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/SwissProt/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..24f622f
Binary files /dev/null and b/code/lib/Bio/SwissProt/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/TogoWS/__init__.py b/code/lib/Bio/TogoWS/__init__.py
new file mode 100644
index 0000000..d7072d1
--- /dev/null
+++ b/code/lib/Bio/TogoWS/__init__.py
@@ -0,0 +1,376 @@
+# Copyright 2010-2011, 2013-2014, 2016-2018 by Peter Cock.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Provides code to access the TogoWS integrated websevices of DBCLS, Japan.
+
+This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See:
+http://togows.dbcls.jp/
+
+The TogoWS REST service provides simple access to a range of databases, acting
+as a proxy to shield you from all the different provider APIs. This works using
+simple URLs (which this module will construct for you). For more details, see
+http://togows.dbcls.jp/site/en/rest.html
+
+The functionality is somewhat similar to Biopython's Bio.Entrez module, which
+provides access to the NCBI's Entrez Utilities (E-Utils) and likewise covers a
+wide range of databases.
+
+Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose
+requirements are reasonably clear). To avoid risking overloading the service,
+Biopython will only allow three calls per second.
+
+The TogoWS SOAP service offers a more complex API for calling web services
+(essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For
+example, this allows you to run a remote BLAST search at the DDBJ. This is
+not yet covered by this module, however there are lots of Python examples
+on the TogoWS website using the SOAPpy python library. See:
+http://togows.dbcls.jp/site/en/soap.html
+http://soapy.sourceforge.net/
+"""
+
+
+import io
+import time
+
+from urllib.request import urlopen
+from urllib.parse import quote
+
+
+# Constant
+_BASE_URL = "http://togows.dbcls.jp"
+
+# Caches:
+_search_db_names = None
+_entry_db_names = None
+_entry_db_fields = {}
+_entry_db_formats = {}
+_convert_formats = []
+
+
+def _get_fields(url):
+    """Query a TogoWS URL for a plain text list of values (PRIVATE)."""
+    handle = _open(url)
+    fields = handle.read().strip().split()
+    handle.close()
+    return fields
+
+
+def _get_entry_dbs():
+    return _get_fields(_BASE_URL + "/entry")
+
+
+def _get_entry_fields(db):
+    return _get_fields(_BASE_URL + "/entry/%s?fields" % db)
+
+
+def _get_entry_formats(db):
+    return _get_fields(_BASE_URL + "/entry/%s?formats" % db)
+
+
+def _get_convert_formats():
+    return [pair.split(".") for pair in _get_fields(_BASE_URL + "/convert/")]
+
+
+def entry(db, id, format=None, field=None):
+    """Call TogoWS 'entry' to fetch a record.
+
+    Arguments:
+     - db - database (string), see list below.
+     - id - identifier (string) or a list of identifiers (either as a list of
+       strings or a single string with comma separators).
+     - format - return data file format (string), options depend on the database
+       e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
+     - field - specific field from within the database record (string)
+       e.g. "au" or "authors" for pubmed.
+
+    At the time of writing, this includes the following::
+
+        KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
+              module, pathway
+        DDBj: ddbj, dad, pdb
+        NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, omim,
+              homologue, snp, mesh, pubmed
+        EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50
+
+    For the current list, please see http://togows.dbcls.jp/entry/
+
+    This function is essentially equivalent to the NCBI Entrez service
+    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
+    does not offer field extraction.
+    """
+    global _entry_db_names, _entry_db_fields, _entry_db_formats
+    if _entry_db_names is None:
+        _entry_db_names = _get_entry_dbs()
+    if db not in _entry_db_names:
+        raise ValueError(
+            "TogoWS entry fetch does not officially support database '%s'." % db
+        )
+    if field:
+        try:
+            fields = _entry_db_fields[db]
+        except KeyError:
+            fields = _get_entry_fields(db)
+            _entry_db_fields[db] = fields
+        if db == "pubmed" and field == "ti" and "title" in fields:
+            # Backwards compatibility fix for TogoWS change Nov/Dec 2013
+            field = "title"
+            import warnings
+
+            warnings.warn(
+                "TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead."
+            )
+        if field not in fields:
+            raise ValueError(
+                "TogoWS entry fetch does not explicitly support "
+                "field '%s' for database '%s'. Only: %s"
+                % (field, db, ", ".join(sorted(fields)))
+            )
+    if format:
+        try:
+            formats = _entry_db_formats[db]
+        except KeyError:
+            formats = _get_entry_formats(db)
+            _entry_db_formats[db] = formats
+        if format not in formats:
+            raise ValueError(
+                "TogoWS entry fetch does not explicitly support "
+                "format '%s' for database '%s'. Only: %s"
+                % (format, db, ", ".join(sorted(formats)))
+            )
+
+    if isinstance(id, list):
+        id = ",".join(id)
+    url = _BASE_URL + "/entry/%s/%s" % (db, quote(id))
+    if field:
+        url += "/" + field
+    if format:
+        url += "." + format
+    return _open(url)
+
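+# A minimal usage sketch (not part of the TogoWS API itself), assuming network
+# access to togows.dbcls.jp; the PubMed ID and the "title" field below are
+# illustrative placeholders.
+def _demo_entry():
+    """Fetch the title of one PubMed record via entry() (hedged sketch)."""
+    handle = entry("pubmed", "19304878", field="title")
+    title = handle.read().strip()
+    handle.close()
+    return title
+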
+
+def search_count(db, query):
+    """Call TogoWS search count to see how many matches a search gives.
+
+    Arguments:
+     - db - database (string), see http://togows.dbcls.jp/search
+     - query - search term (string)
+
+    You could then use the count to download a large set of search results in
+    batches using the offset and limit options to Bio.TogoWS.search(). In
+    general, however, the Bio.TogoWS.search_iter() function is simpler to use.
+    """
+    global _search_db_names
+    if _search_db_names is None:
+        _search_db_names = _get_fields(_BASE_URL + "/search")
+    if db not in _search_db_names:
+        # TODO - Make this a ValueError? Right now, despite what the HTML
+        # website claims, "gene" and "ncbi-gene" don't work and are not listed.
+        import warnings
+
+        warnings.warn(
+            "TogoWS search does not officially support database '%s'. "
+            "See %s/search/ for options." % (db, _BASE_URL)
+        )
+    url = _BASE_URL + "/search/%s/%s/count" % (db, quote(query))
+    handle = _open(url)
+    data = handle.read()
+    handle.close()
+    if not data:
+        raise ValueError("TogoWS returned no data from URL %s" % url)
+    try:
+        return int(data.strip())
+    except ValueError:
+        raise ValueError(
+            "Expected an integer from URL %s, got: %r" % (url, data)
+        ) from None
+
+
+def search_iter(db, query, limit=None, batch=100):
+    """Call TogoWS search iterating over the results (generator function).
+
+    Arguments:
+     - db - database (string), see http://togows.dbcls.jp/search
+     - query - search term (string)
+     - limit - optional upper bound on number of search results
+     - batch - number of search results to pull back each time we talk to
+       TogoWS (currently limited to 100).
+
+    You would use this function within a for loop, e.g.
+
+    >>> from Bio import TogoWS
+    >>> for id in TogoWS.search_iter("pubmed", "diabetes+human", limit=10):
+    ...     print("PubMed ID: %s" % id)  # maybe fetch data with entry?
+    PubMed ID: ...
+
+    Internally this first calls the Bio.TogoWS.search_count() and then
+    uses Bio.TogoWS.search() to get the results in batches.
+    """
+    count = search_count(db, query)
+    if not count:
+        return
+    # NOTE - We leave it to TogoWS to enforce any upper bound on each
+    # batch; they currently return an HTTP 400 Bad Request if above 100.
+    remain = count
+    if limit is not None:
+        remain = min(remain, limit)
+    offset = 1  # They don't use zero based counting
+    prev_ids = []  # Just cache the last batch for error checking
+    while remain:
+        batch = min(batch, remain)
+        # print("%r left, asking for %r" % (remain, batch))
+        ids = search(db, query, offset, batch).read().strip().split()
+        assert len(ids) == batch, "Got %i, expected %i" % (len(ids), batch)
+        # print("offset %i, %s ... %s" % (offset, ids[0], ids[-1]))
+        if ids == prev_ids:
+            raise RuntimeError("Same search results for previous offset")
+        for identifier in ids:
+            if identifier in prev_ids:
+                raise RuntimeError("Result %s was in previous batch" % identifier)
+            yield identifier
+        offset += batch
+        remain -= batch
+        prev_ids = ids
+
+
+def search(db, query, offset=None, limit=None, format=None):
+    """Call TogoWS search.
+
+    This is a low level wrapper for the TogoWS search function, which
+    can return results in a several formats. In general, the search_iter
+    function is more suitable for end users.
+
+    Arguments:
+     - db - database (string), see http://togows.dbcls.jp/search/
+     - query - search term (string)
+     - offset, limit - optional integers specifying which result to start from
+       (1 based) and the number of results to return.
+     - format - return data file format (string), e.g. "json", "ttl" (RDF)
+       By default plain text is returned, one result per line.
+
+    At the time of writing, TogoWS applies a default count limit of 100
+    search results, and this is an upper bound. To access more results,
+    use the offset argument or the search_iter(...) function.
+
+    TogoWS supports a long list of databases, including many from the NCBI
+    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
+    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
+    "uniprot, "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
+    For the current list, see http://togows.dbcls.jp/search/
+
+    The NCBI provide the Entrez Search service (ESearch) which is similar,
+    available in Biopython as the Bio.Entrez.esearch() function.
+
+    See also the function Bio.TogoWS.search_count() which returns the number
+    of matches found, and the Bio.TogoWS.search_iter() function which allows
+    you to iterate over the search results (taking care of batching for you).
+    """
+    global _search_db_names
+    if _search_db_names is None:
+        _search_db_names = _get_fields(_BASE_URL + "/search")
+    if db not in _search_db_names:
+        # TODO - Make this a ValueError? Right now, despite what the HTML
+        # website claims, "gene" and "ncbi-gene" don't work and are not listed.
+        import warnings
+
+        warnings.warn(
+            "TogoWS search does not explicitly support database '%s'. "
+            "See %s/search/ for options." % (db, _BASE_URL)
+        )
+    url = _BASE_URL + "/search/%s/%s" % (db, quote(query))
+    if offset is not None and limit is not None:
+        try:
+            offset = int(offset)
+        except ValueError:
+            raise ValueError(
+                "Offset should be an integer (at least one), not %r" % offset
+            ) from None
+        try:
+            limit = int(limit)
+        except ValueError:
+            raise ValueError(
+                "Limit should be an integer (at least one), not %r" % limit
+            ) from None
+        if offset <= 0:
+            raise ValueError("Offset should be at least one, not %i" % offset)
+        if limit <= 0:
+            raise ValueError("Count should be at least one, not %i" % limit)
+        url += "/%i,%i" % (offset, limit)
+    elif offset is not None or limit is not None:
+        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
+    if format:
+        url += "." + format
+    # print(url)
+    return _open(url)
+
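+# A hedged paging sketch, assuming network access: TogoWS caps each request
+# at 100 results, so offset/limit stepping (or search_iter) is needed to pull
+# more. The query string is a placeholder.
+def _demo_search_batches():
+    """Collect up to the first 300 search hits in batches of 100."""
+    collected = []
+    offset = 1  # TogoWS offsets are one-based
+    while len(collected) < 300:
+        handle = search("pubmed", "diabetes+human", offset=offset, limit=100)
+        ids = handle.read().strip().split()
+        handle.close()
+        if not ids:  # no more results
+            break
+        collected.extend(ids)
+        offset += len(ids)
+    return collected
+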
+
+def convert(data, in_format, out_format):
+    """Call TogoWS for file format conversion.
+
+    Arguments:
+     - data - string or handle containing input record(s)
+     - in_format - string describing the input file format (e.g. "genbank")
+     - out_format - string describing the requested output format (e.g. "fasta")
+
+    For a list of supported conversions (e.g. "genbank" to "fasta"), see
+    http://togows.dbcls.jp/convert/
+
+    Note that Biopython has built-in support for conversion of sequence and
+    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert).
+    """
+    global _convert_formats
+    if not _convert_formats:
+        _convert_formats = _get_convert_formats()
+    if [in_format, out_format] not in _convert_formats:
+        msg = "\n".join("%s -> %s" % tuple(pair) for pair in _convert_formats)
+        raise ValueError("Unsupported conversion. Choose from:\n%s" % msg)
+    url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format)
+    # TODO - Should we just accept a string not a handle? What about a filename?
+    try:
+        # Handle
+        data = data.read()
+    except AttributeError:
+        # String
+        pass
+    return _open(url, post=data)
+
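+# A minimal sketch of convert(), assuming network access; the input file name
+# is a hypothetical placeholder. convert() accepts either a string or a handle.
+def _demo_convert(path="example.gbk"):
+    """Convert a GenBank record to FASTA via the TogoWS convert service."""
+    with open(path) as in_handle:
+        fasta_handle = convert(in_handle, "genbank", "fasta")
+    return fasta_handle.read()
+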
+
+def _open(url, post=None):
+    """Build the URL and open a handle to it (PRIVATE).
+
+    Open a handle to TogoWS, will raise an IOError if it encounters an error.
+
+    In the absence of clear guidelines, this function enforces a limit of
+    "up to three queries per second" to avoid abusing the TogoWS servers.
+    """
+    delay = 0.333333333  # one third of a second
+    current = time.time()
+    wait = _open.previous + delay - current
+    if wait > 0:
+        time.sleep(wait)
+        _open.previous = current + wait
+    else:
+        _open.previous = current
+
+    if post:
+        handle = urlopen(url, post.encode())
+    else:
+        handle = urlopen(url)
+
+    # We now trust TogoWS to have set an HTTP error code, that
+    # suffices for my current unit tests. Previously we would
+    # examine the start of the data returned back.
+    text_handle = io.TextIOWrapper(handle, encoding="UTF-8")
+    text_handle.url = handle.url
+    return text_handle
+
+
+_open.previous = 0
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/TogoWS/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/TogoWS/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..13fb541
Binary files /dev/null and b/code/lib/Bio/TogoWS/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/UniGene/__init__.py b/code/lib/Bio/UniGene/__init__.py
new file mode 100644
index 0000000..6629c87
--- /dev/null
+++ b/code/lib/Bio/UniGene/__init__.py
@@ -0,0 +1,340 @@
+# Copyright 2006 by Sean Davis, National Cancer Institute, NIH.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Parse Unigene flat file format files such as the Hs.data file.
+
+Here is an overview of the flat file format that this parser deals with:
+
+   Line types/qualifiers::
+
+       ID           UniGene cluster ID
+       TITLE        Title for the cluster
+       GENE         Gene symbol
+       CYTOBAND     Cytological band
+       EXPRESS      Tissues of origin for ESTs in cluster
+       RESTR_EXPR   Single tissue or development stage contributes
+                    more than half the total EST frequency for this gene.
+       GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
+                    T if a non-templated polyA tail is found among
+                    a cluster's sequences; else
+                    I if templated As are found in genomic sequence or
+                    S if a canonical polyA signal is found on
+                      the genomic sequence
+       GENE_ID      Entrez gene identifier associated with at least one
+                    sequence in this cluster;
+                    to be used instead of LocusLink.
+       LOCUSLINK    LocusLink identifier associated with at least one
+                    sequence in this cluster;
+                    deprecated in favor of GENE_ID
+       HOMOL        Homology;
+       CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
+                    on the arabidopsis genome.
+       STS          STS
+            ACC=         GenBank/EMBL/DDBJ accession number of STS
+                         [optional field]
+            UNISTS=      identifier in NCBI's UNISTS database
+       TXMAP        Transcript map interval
+            MARKER=      Marker found on at least one sequence in this
+                         cluster
+            RHPANEL=     Radiation Hybrid panel used to place marker
+       PROTSIM      Protein Similarity data for the sequence with
+                    highest-scoring protein similarity in this cluster
+            ORG=         Organism
+            PROTGI=      Sequence GI of protein
+            PROTID=      Sequence ID of protein
+            PCT=         Percent alignment
+            ALN=         length of aligned region (aa)
+       SCOUNT       Number of sequences in the cluster
+       SEQUENCE     Sequence
+            ACC=         GenBank/EMBL/DDBJ accession number of sequence
+            NID=         Unique nucleotide sequence identifier (gi)
+            PID=         Unique protein sequence identifier (used for
+                         non-ESTs)
+            CLONE=       Clone identifier (used for ESTs only)
+            END=         End (5'/3') of clone insert read (used for
+                         ESTs only)
+            LID=         Library ID; see Hs.lib.info for library name
+                         and tissue
+            MGC=         5' CDS-completeness indicator; if present, the
+                         clone associated with this sequence is believed
+                         CDS-complete. A value greater than 511 is the gi
+                         of the CDS-complete mRNA matched by the EST,
+                         otherwise the value is an indicator of the
+                         reliability of the test indicating CDS
+                         completeness; higher values indicate more
+                         reliable CDS-completeness predictions.
+            SEQTYPE=     Description of the nucleotide sequence.
+                         Possible values are mRNA, EST and HTC.
+            TRACE=       The Trace ID of the EST sequence, as provided by
+                         NCBI Trace Archive
+
+"""
+
+
+class SequenceLine:
+    """Store the information for one SEQUENCE line from a Unigene file.
+
+    Initialize with the text part of the SEQUENCE line, or nothing.
+
+    Attributes and descriptions (access as LOWER CASE):
+     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
+     - NID=         Unique nucleotide sequence identifier (gi)
+     - PID=         Unique protein sequence identifier (used for non-ESTs)
+     - CLONE=       Clone identifier (used for ESTs only)
+     - END=         End (5'/3') of clone insert read (used for ESTs only)
+     - LID=         Library ID; see Hs.lib.info for library name and tissue
+     - MGC=         5' CDS-completeness indicator; if present,
+       the clone associated with this sequence
+       is believed CDS-complete. A value greater than 511
+       is the gi of the CDS-complete mRNA matched by the EST,
+       otherwise the value is an indicator of the reliability
+       of the test indicating CDS completeness;
+       higher values indicate more reliable CDS-completeness
+       predictions.
+     - SEQTYPE=     Description of the nucleotide sequence. Possible values
+       are mRNA, EST and HTC.
+     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
+       Trace Archive
+
+    """
+
+    def __init__(self, text=None):
+        """Initialize the class."""
+        self.acc = ""
+        self.nid = ""
+        self.lid = ""
+        self.pid = ""
+        self.clone = ""
+        self.image = ""
+        self.is_image = False
+        self.end = ""
+        self.mgc = ""
+        self.seqtype = ""
+        self.trace = ""
+        if text is not None:
+            self.text = text
+            self._init_from_text(text)
+
+    def _init_from_text(self, text):
+        parts = text.split("; ")
+        for part in parts:
+            key, val = part.split("=")
+            if key == "CLONE":
+                if val[:5] == "IMAGE":
+                    self.is_image = True
+                    self.image = val[6:]
+            setattr(self, key.lower(), val)
+
+    def __repr__(self):
+        """Return UniGene SequenceLine object as a string."""
+        return self.text
+
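+# A small illustrative sketch (the SEQUENCE text below is made up): each
+# KEY=value pair becomes a lower-case attribute, and IMAGE clones are flagged.
+def _demo_sequence_line():
+    """Parse one hypothetical SEQUENCE line and check its attributes."""
+    line = SequenceLine("ACC=AI123456.1; NID=g5146351; CLONE=IMAGE:12345; END=5'; LID=1944")
+    assert line.acc == "AI123456.1"
+    assert line.is_image and line.image == "12345"
+    return line
+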
+
+class ProtsimLine:
+    """Store the information for one PROTSIM line from a Unigene file.
+
+    Initialize with the text part of the PROTSIM line, or nothing.
+
+    Attributes and descriptions (access as LOWER CASE)
+    ORG=         Organism
+    PROTGI=      Sequence GI of protein
+    PROTID=      Sequence ID of protein
+    PCT=         Percent alignment
+    ALN=         length of aligned region (aa)
+    """
+
+    def __init__(self, text=None):
+        """Initialize the class."""
+        self.org = ""
+        self.protgi = ""
+        self.protid = ""
+        self.pct = ""
+        self.aln = ""
+        if text is not None:
+            self.text = text
+            self._init_from_text(text)
+
+    def _init_from_text(self, text):
+        parts = text.split("; ")
+
+        for part in parts:
+            key, val = part.split("=")
+            setattr(self, key.lower(), val)
+
+    def __repr__(self):
+        """Return UniGene ProtsimLine object as a string."""
+        return self.text
+
+
+class STSLine:
+    """Store the information for one STS line from a Unigene file.
+
+    Initialize with the text part of the STS line, or nothing.
+
+    Attributes and descriptions (access as LOWER CASE)
+
+    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
+    UNISTS=      identifier in NCBI's UNISTS database
+    """
+
+    def __init__(self, text=None):
+        """Initialize the class."""
+        self.acc = ""
+        self.unists = ""
+        if text is not None:
+            self.text = text
+            self._init_from_text(text)
+
+    def _init_from_text(self, text):
+        parts = text.split(" ")
+
+        for part in parts:
+            key, val = part.split("=")
+            setattr(self, key.lower(), val)
+
+    def __repr__(self):
+        """Return UniGene STSLine object as a string."""
+        return self.text
+
+
+class Record:
+    """Store a Unigene record.
+
+    Here is what is stored::
+
+        self.ID           = ''  # ID line
+        self.species      = ''  # Hs, Bt, etc.
+        self.title        = ''  # TITLE line
+        self.symbol       = ''  # GENE line
+        self.cytoband     = ''  # CYTOBAND line
+        self.express      = []  # EXPRESS line, parsed on ';'
+                                # Will be an array of strings
+        self.restr_expr   = ''  # RESTR_EXPR line
+        self.gnm_terminus = ''  # GNM_TERMINUS line
+        self.gene_id      = ''  # GENE_ID line
+        self.locuslink    = ''  # LOCUSLINK line
+        self.homol        = ''  # HOMOL line
+        self.chromosome   = ''  # CHROMOSOME line
+        self.protsim      = []  # PROTSIM entries, array of Protsims
+                                # Type ProtsimLine
+        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
+                                # Type SequenceLine
+        self.sts          = []  # STS entries, array of STS entries
+                                # Type STSLine
+        self.txmap        = []  # TXMAP entries, array of TXMap entries
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.ID = ""  # ID line
+        self.species = ""  # Hs, Bt, etc.
+        self.title = ""  # TITLE line
+        self.symbol = ""  # GENE line
+        self.cytoband = ""  # CYTOBAND line
+        self.express = []  # EXPRESS line, parsed on ';'
+        self.restr_expr = ""  # RESTR_EXPR line
+        self.gnm_terminus = ""  # GNM_TERMINUS line
+        self.gene_id = ""  # GENE_ID line
+        self.locuslink = ""  # LOCUSLINK line
+        self.homol = ""  # HOMOL line
+        self.chromosome = ""  # CHROMOSOME line
+        self.protsim = []  # PROTSIM entries, array of Protsims
+        self.sequence = []  # SEQUENCE entries, array of Sequence entries
+        self.sts = []  # STS entries, array of STS entries
+        self.txmap = []  # TXMAP entries, array of TXMap entries
+
+    def __repr__(self):
+        """Represent the UniGene Record object as a string for debugging."""
+        return "<%s> %s %s %s" % (
+            self.__class__.__name__,
+            self.ID,
+            self.symbol,
+            self.title,
+        )
+
+
+def parse(handle):
+    """Read and load a UniGene records, for files containing multiple records."""
+    while True:
+        record = _read(handle)
+        if not record:
+            return
+        yield record
+
+
+def read(handle):
+    """Read and load a UniGene record, one record per file."""
+    record = _read(handle)
+    if not record:
+        raise ValueError("No SwissProt record found")
+    # We should have reached the end of the record by now
+    remainder = handle.read()
+    if remainder:
+        raise ValueError("More than one SwissProt record found")
+    return record
+
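+# A minimal usage sketch; "Hs.data" is a placeholder path to a UniGene
+# flat file such as the one described in the module docstring.
+def _demo_parse(path="Hs.data"):
+    """Print a one-line summary for each record in a UniGene flat file."""
+    with open(path) as handle:
+        for record in parse(handle):
+            print(record.ID, record.symbol, record.title)
+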
+
+# Everything below is private
+
+
+def _read(handle):
+    UG_INDENT = 12
+    record = None
+    for line in handle:
+        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
+        line = line.rstrip()
+        if tag == "ID":
+            record = Record()
+            record.ID = value
+            record.species = record.ID.split(".")[0]
+        elif tag == "TITLE":
+            record.title = value
+        elif tag == "GENE":
+            record.symbol = value
+        elif tag == "GENE_ID":
+            record.gene_id = value
+        elif tag == "LOCUSLINK":
+            record.locuslink = value
+        elif tag == "HOMOL":
+            if value == "YES":
+                record.homol = True
+            elif value == "NO":
+                record.homol = True
+            else:
+                raise ValueError("Cannot parse HOMOL line %s" % line)
+        elif tag == "EXPRESS":
+            record.express = [word.strip() for word in value.split("|")]
+        elif tag == "RESTR_EXPR":
+            record.restr_expr = [word.strip() for word in value.split("|")]
+        elif tag == "CHROMOSOME":
+            record.chromosome = value
+        elif tag == "CYTOBAND":
+            record.cytoband = value
+        elif tag == "PROTSIM":
+            protsim = ProtsimLine(value)
+            record.protsim.append(protsim)
+        elif tag == "SCOUNT":
+            scount = int(value)
+        elif tag == "SEQUENCE":
+            sequence = SequenceLine(value)
+            record.sequence.append(sequence)
+        elif tag == "STS":
+            sts = STSLine(value)
+            record.sts.append(sts)
+        elif tag == "//":
+            if len(record.sequence) != scount:
+                raise ValueError(
+                    "The number of sequences specified in the record "
+                    "(%d) does not agree with the number of sequences found (%d)"
+                    % (scount, len(record.sequence))
+                )
+            return record
+        else:
+            raise ValueError("Unknown tag %s" % tag)
+    if record:
+        raise ValueError("Unexpected end of stream.")
diff --git a/code/lib/Bio/UniGene/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/UniGene/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..8eb4548
Binary files /dev/null and b/code/lib/Bio/UniGene/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/UniProt/GOA.py b/code/lib/Bio/UniProt/GOA.py
new file mode 100644
index 0000000..4beff4c
--- /dev/null
+++ b/code/lib/Bio/UniProt/GOA.py
@@ -0,0 +1,497 @@
+#!/usr/bin/env python
+# Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com. All rights reserved.
+# Copyright 2020 by Sergio Valqui. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Parsers for the GAF, GPA and GPI formats from UniProt-GOA.
+
+Uniprot-GOA README + GAF format description:
+ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README
+
+Gene Association File, GAF formats:
+http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
+http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/
+
+Gene Product Association Data  (GPA format) README:
+http://geneontology.org/docs/gene-product-association-data-gpad-format/
+
+Gene Product Information (GPI format) README:
+http://geneontology.org/docs/gene-product-information-gpi-format/
+
+Go Annotation files are located here:
+ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
+"""
+
+
+import copy
+
+# GAF: GO Annotation Format
+#
+# GAF version 2.0
+
+GAF20FIELDS = [
+    "DB",
+    "DB_Object_ID",
+    "DB_Object_Symbol",
+    "Qualifier",
+    "GO_ID",
+    "DB:Reference",
+    "Evidence",
+    "With",
+    "Aspect",
+    "DB_Object_Name",
+    "Synonym",
+    "DB_Object_Type",
+    "Taxon_ID",
+    "Date",
+    "Assigned_By",
+    "Annotation_Extension",
+    "Gene_Product_Form_ID",
+]
+
+# GAF version 1.0
+GAF10FIELDS = [
+    "DB",
+    "DB_Object_ID",
+    "DB_Object_Symbol",
+    "Qualifier",
+    "GO_ID",
+    "DB:Reference",
+    "Evidence",
+    "With",
+    "Aspect",
+    "DB_Object_Name",
+    "Synonym",
+    "DB_Object_Type",
+    "Taxon_ID",
+    "Date",
+    "Assigned_By",
+]
+
+# GPA version 1.0
+GPA10FIELDS = [
+    "DB",
+    "DB_Object_ID",
+    "Qualifier",
+    "GO_ID",
+    "DB:Reference",
+    "Evidence code",
+    "With",
+    "Interacting_taxon_ID",
+    "Date",
+    "Assigned_by",
+    "Annotation_Extension",
+    "Spliceform_ID",
+]
+
+# GPA version 1.1
+GPA11FIELDS = [
+    "DB",
+    "DB_Object_ID",
+    "Qualifier",
+    "GO_ID",
+    "DB:Reference",
+    "ECO_Evidence_code",
+    "With",
+    "Interacting_taxon_ID",
+    "Date",
+    "Assigned_by",
+    "Annotation Extension",
+    "Annotation_Properties",
+]
+
+# GPI version 1.0
+GPI10FIELDS = [
+    "DB",
+    "DB_subset",
+    "DB_Object_ID",
+    "DB_Object_Symbol",
+    "DB_Object_Name",
+    "DB_Object_Synonym",
+    "DB_Object_Type",
+    "Taxon",
+    "Annotation_Target_Set",
+    "Annotation_Completed",
+    "Parent_Object_ID",
+]
+
+# GPI version 1.1
+GPI11FIELDS = [
+    "DB_Object_ID",
+    "DB_Object_Symbol",
+    "DB_Object_Name",
+    "DB_Object_Synonym",
+    "DB_Object_Type",
+    "Taxon",
+    "Parent_Object_ID",
+    "DB_Xref",
+    "Gene_Product_Properties",
+]
+
+# GPI version 1.2
+GPI12FIELDS = [
+    "DB",
+    "DB_Object_ID",
+    "DB_Object_Symbol",
+    "DB_Object_Name",
+    "DB_Object_Synonym",
+    "DB_Object_Type",
+    "Taxon",
+    "Parent_Object_ID",
+    "DB_Xref",
+    "Gene_Product_Properties",
+]
+
+
+def _gpi10iterator(handle):
+    """Read GPI 1.0 format files (PRIVATE).
+
+    This iterator is used to read a gp_information.goa_uniprot
+    file which is in the GPI 1.0 format.
+    """
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[5] = inrec[5].split("|")  # DB_Object_Synonym(s)
+        inrec[8] = inrec[8].split("|")  # Annotation_Target_Set
+        yield dict(zip(GPI10FIELDS, inrec))
+
+
+def _gpi11iterator(handle):
+    """Read GPI 1.1 format files (PRIVATE).
+
+    This iterator is used to read a gp_information.goa_uniprot
+    file which is in the GPI 1.1 format.
+    """
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[2] = inrec[2].split("|")  # DB_Object_Name
+        inrec[3] = inrec[3].split("|")  # DB_Object_Synonym(s)
+        inrec[7] = inrec[7].split("|")  # DB_Xref(s)
+        inrec[8] = inrec[8].split("|")  # Properties
+        yield dict(zip(GPI11FIELDS, inrec))
+
+
+def _gpi12iterator(handle):
+    """Read GPI 1.2 format files (PRIVATE).
+
+    This iterator is used to read a gp_information.goa_uniprot
+    file which is in the GPI 1.2 format.
+    """
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[3] = inrec[3].split("|")  # DB_Object_Name
+        inrec[4] = inrec[4].split("|")  # DB_Object_Synonym(s)
+        inrec[8] = inrec[8].split("|")  # DB_Xref(s)
+        inrec[9] = inrec[9].split("|")  # Properties
+        yield dict(zip(GPI12FIELDS, inrec))
+
+
+def gpi_iterator(handle):
+    """Read GPI format files.
+
+    This function should be called to read a
+    gp_information.goa_uniprot file. It reads the version line and
+    dispatches to the matching GPI 1.x iterator (GPI 2.x is not yet
+    supported).
+    """
+    inline = handle.readline()
+    if inline.strip() == "!gpi-version: 1.2":
+        return _gpi12iterator(handle)
+    elif inline.strip() == "!gpi-version: 1.1":
+        # sys.stderr.write("gpi 1.1\n")
+        return _gpi11iterator(handle)
+    elif inline.strip() == "!gpi-version: 1.0":
+        # sys.stderr.write("gpi 1.0\n")
+        return _gpi10iterator(handle)
+    elif inline.strip() == "!gpi-version: 2.1":
+        # sys.stderr.write("gpi 2.1\n")
+        # return _gpi20iterator(handle)
+        raise NotImplementedError("Sorry, parsing GPI version 2 not implemented yet.")
+    else:
+        raise ValueError(f"Unknown GPI version {inline}\n")
+
+
+def _gpa10iterator(handle):
+    """Read GPA 1.0 format files (PRIVATE).
+
+    This iterator is used to read a gp_association.*
+    file which is in the GPA 1.0 format. Do not call directly. Rather,
+    use the gpa_iterator function.
+    """
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[2] = inrec[2].split("|")  # Qualifier
+        inrec[4] = inrec[4].split("|")  # DB:Reference(s)
+        inrec[6] = inrec[6].split("|")  # With
+        inrec[10] = inrec[10].split("|")  # Annotation extension
+        yield dict(zip(GPA10FIELDS, inrec))
+
+
+def _gpa11iterator(handle):
+    """Read GPA 1.1 format files (PRIVATE).
+
+    This iterator is used to read a gp_association.goa_uniprot
+    file which is in the GPA 1.1 format. Do not call directly. Rather,
+    use the gpa_iterator function.
+    """
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[2] = inrec[2].split("|")  # Qualifier
+        inrec[4] = inrec[4].split("|")  # DB:Reference(s)
+        inrec[6] = inrec[6].split("|")  # With
+        inrec[10] = inrec[10].split("|")  # Annotation extension
+        yield dict(zip(GPA11FIELDS, inrec))
+
+
+def gpa_iterator(handle):
+    """Read GPA format files.
+
+    This function should be called to read a
+    gp_association.goa_uniprot file. It reads the version line and
+    returns a GPA 1.1 or GPA 1.0 iterator as needed.
+    """
+    inline = handle.readline()
+    if inline.strip() == "!gpa-version: 1.1":
+        # sys.stderr.write("gpa 1.1\n")
+        return _gpa11iterator(handle)
+    elif inline.strip() == "!gpa-version: 1.0":
+        # sys.stderr.write("gpa 1.0\n")
+        return _gpa10iterator(handle)
+    else:
+        raise ValueError(f"Unknown GPA version {inline}\n")
+
+
+def _gaf20iterator(handle):
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[3] = inrec[3].split("|")  # Qualifier
+        inrec[5] = inrec[5].split("|")  # DB:reference(s)
+        inrec[7] = inrec[7].split("|")  # With || From
+        inrec[10] = inrec[10].split("|")  # Synonym
+        inrec[12] = inrec[12].split("|")  # Taxon
+        yield dict(zip(GAF20FIELDS, inrec))
+
+
+def _gaf10iterator(handle):
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[3] = inrec[3].split("|")  # Qualifier
+        inrec[5] = inrec[5].split("|")  # DB:reference(s)
+        inrec[7] = inrec[7].split("|")  # With || From
+        inrec[10] = inrec[10].split("|")  # Synonym
+        inrec[12] = inrec[12].split("|")  # Taxon
+        yield dict(zip(GAF10FIELDS, inrec))
+
+
+def _gaf10byproteiniterator(handle):
+    cur_id = None
+    id_rec_list = []
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[3] = inrec[3].split("|")  # Qualifier
+        inrec[5] = inrec[5].split("|")  # DB:reference(s)
+        inrec[7] = inrec[7].split("|")  # With || From
+        inrec[10] = inrec[10].split("|")  # Synonym
+        inrec[12] = inrec[12].split("|")  # Taxon
+        cur_rec = dict(zip(GAF10FIELDS, inrec))
+        if cur_rec["DB_Object_ID"] != cur_id and cur_id:
+            ret_list = copy.copy(id_rec_list)
+            id_rec_list = [cur_rec]
+            cur_id = cur_rec["DB_Object_ID"]
+            yield ret_list
+        else:
+            cur_id = cur_rec["DB_Object_ID"]
+            id_rec_list.append(cur_rec)
+    # Yield the final group of records once the file is exhausted
+    if id_rec_list:
+        yield id_rec_list
+
+
+def _gaf20byproteiniterator(handle):
+    cur_id = None
+    id_rec_list = []
+    for inline in handle:
+        if inline[0] == "!":
+            continue
+        inrec = inline.rstrip("\n").split("\t")
+        if len(inrec) == 1:
+            continue
+        inrec[3] = inrec[3].split("|")  # Qualifier
+        inrec[5] = inrec[5].split("|")  # DB:reference(s)
+        inrec[7] = inrec[7].split("|")  # With || From
+        inrec[10] = inrec[10].split("|")  # Synonym
+        inrec[12] = inrec[12].split("|")  # Taxon
+        cur_rec = dict(zip(GAF20FIELDS, inrec))
+        if cur_rec["DB_Object_ID"] != cur_id and cur_id:
+            ret_list = copy.copy(id_rec_list)
+            id_rec_list = [cur_rec]
+            cur_id = cur_rec["DB_Object_ID"]
+            yield ret_list
+        else:
+            cur_id = cur_rec["DB_Object_ID"]
+            id_rec_list.append(cur_rec)
+    # Yield the final group of records once the file is exhausted
+    if id_rec_list:
+        yield id_rec_list
+
+
+def gafbyproteiniterator(handle):
+    """Iterate over records in a gene association file.
+
+    Yields lists of all consecutive records sharing the same DB_Object_ID.
+    This function should be called to read a
+    gene_association.goa_uniprot file. It reads the version line and
+    returns a GAF 2.0 or GAF 1.0 iterator as needed; GAF 2.1 files are
+    currently handled by the GAF 2.0 iterator.
+    """
+    inline = handle.readline()
+    if inline.strip() == "!gaf-version: 2.0":
+        # sys.stderr.write("gaf 2.0\n")
+        return _gaf20byproteiniterator(handle)
+    elif inline.strip() == "!gaf-version: 1.0":
+        # sys.stderr.write("gaf 1.0\n")
+        return _gaf10byproteiniterator(handle)
+    elif inline.strip() == "!gaf-version: 2.1":
+        # Handle GAF 2.1 as GAF 2.0 for now TODO: fix
+        # sys.stderr.write("gaf 2.1\n")
+        return _gaf20byproteiniterator(handle)
+    else:
+        raise ValueError(f"Unknown GAF version {inline}\n")
+
+
+def gafiterator(handle):
+    """Iterate over a GAF 1.0 or 2.0 file.
+
+    This function should be called to read a
+    gene_association.goa_uniprot file. Reads the first record and
+    returns a gaf 2.0 or a gaf 1.0 iterator as needed
+
+    Example: open, read, iterate and filter results.
+
+    The original data file has been trimmed to ~600 rows.
+
+    Original source ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/YEAST/goa_yeast.gaf.gz
+
+    >>> from Bio.UniProt.GOA import gafiterator, record_has
+    >>> Evidence = {'Evidence': set(['ND'])}
+    >>> Synonym = {'Synonym': set(['YA19A_YEAST', 'YAL019W-A'])}
+    >>> Taxon_ID = {'Taxon_ID': set(['taxon:559292'])}
+    >>> with open('UniProt/goa_yeast.gaf', 'r') as handle:
+    ...     for rec in gafiterator(handle):
+    ...         if record_has(rec, Taxon_ID) and record_has(rec, Evidence) and record_has(rec, Synonym):
+    ...             for key in ('DB_Object_Name', 'Evidence', 'Synonym', 'Taxon_ID'):
+    ...                 print(rec[key])
+    ...
+    Putative uncharacterized protein YAL019W-A
+    ND
+    ['YA19A_YEAST', 'YAL019W-A']
+    ['taxon:559292']
+    Putative uncharacterized protein YAL019W-A
+    ND
+    ['YA19A_YEAST', 'YAL019W-A']
+    ['taxon:559292']
+    Putative uncharacterized protein YAL019W-A
+    ND
+    ['YA19A_YEAST', 'YAL019W-A']
+    ['taxon:559292']
+
+    """
+    inline = handle.readline()
+    if inline.strip() == "!gaf-version: 2.0":
+        # sys.stderr.write("gaf 2.0\n")
+        return _gaf20iterator(handle)
+    elif inline.strip() == "!gaf-version: 2.1":
+        # sys.stderr.write("gaf 2.1\n")
+        # Handle GAF 2.1 as GAF 2.0 for now. TODO: fix
+        return _gaf20iterator(handle)
+    elif inline.strip() == "!gaf-version: 1.0":
+        # sys.stderr.write("gaf 1.0\n")
+        return _gaf10iterator(handle)
+    else:
+        raise ValueError(f"Unknown GAF version {inline}\n")
+
+
+def writerec(outrec, handle, fields=GAF20FIELDS):
+    """Write a single UniProt-GOA record to an output stream.
+
+    The caller should know the format version and pass the matching
+    field list (default: GAF 2.0). No version header line is written.
+    """
+    outstr = ""
+    for field in fields[:-1]:
+        if isinstance(outrec[field], list):
+            for subfield in outrec[field]:
+                outstr += subfield + "|"
+            outstr = outstr[:-1] + "\t"
+        else:
+            outstr += outrec[field] + "\t"
+    outstr += outrec[fields[-1]] + "\n"
+    handle.write(outstr)
+
+
+def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
+    """Write a list of GAF records to an output stream.
+
+    The caller should know the format version and pass the matching
+    field list (default: GAF 2.0). Typically the list is one yielded by
+    gafbyproteiniterator, which contains all consecutive records with
+    the same DB_Object_ID.
+    """
+    for outrec in outprotrec:
+        writerec(outrec, handle, fields=fields)
+
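+# A hedged round-trip sketch; both paths are placeholders. Note gafiterator
+# consumes the "!gaf-version" header line, so the copy lacks that header.
+def _demo_roundtrip(in_path, out_path):
+    """Copy a GAF 2.0 file record by record using gafiterator and writerec."""
+    with open(in_path) as in_handle, open(out_path, "w") as out_handle:
+        for rec in gafiterator(in_handle):
+            writerec(rec, out_handle, fields=GAF20FIELDS)
+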
+
+def record_has(inrec, fieldvals):
+    """Accept a record, and a dictionary of field values.
+
+    The format is {'field_name': set([val1, val2])}.
+    If any field in the record has a matching value, the function returns
+    True. Otherwise, it returns False.
+    """
+    retval = False
+    for field in fieldvals:
+        if isinstance(inrec[field], str):
+            set1 = {inrec[field]}
+        else:
+            set1 = set(inrec[field])
+        if set1 & fieldvals[field]:
+            retval = True
+            break
+    return retval
+
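+# A small filtering sketch; the file name is a placeholder. record_has()
+# treats each field as a set, so one shared value is enough to match.
+def _demo_filter(path="goa_yeast.gaf"):
+    """Return only the GAF records annotated with the ND evidence code."""
+    wanted = {"Evidence": {"ND"}}
+    with open(path) as handle:
+        return [rec for rec in gafiterator(handle) if record_has(rec, wanted)]
+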
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/UniProt/__init__.py b/code/lib/Bio/UniProt/__init__.py
new file mode 100644
index 0000000..56f36f9
--- /dev/null
+++ b/code/lib/Bio/UniProt/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2013 by Iddo Friedberg idoerg@gmail.com
+# Revision copyright 2013 by Peter Cock.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for dealing with assorted UniProt file formats.
+
+This currently include parsers for the GAF, GPA and GPI formats
+from UniProt-GOA as the module Bio.UniProt.GOA.
+
+See also Bio.SwissProt and the "swiss" support in Bio.SeqIO for
+the legacy plain text sequence format still used in UniProt.
+
+See also Bio.SeqIO.SwissIO for the "uniprot-xml" support in
+Bio.SeqIO.
+"""
diff --git a/code/lib/Bio/UniProt/__pycache__/GOA.cpython-37.pyc b/code/lib/Bio/UniProt/__pycache__/GOA.cpython-37.pyc
new file mode 100644
index 0000000..56dadb3
Binary files /dev/null and b/code/lib/Bio/UniProt/__pycache__/GOA.cpython-37.pyc differ
diff --git a/code/lib/Bio/UniProt/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/UniProt/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..b96cb5a
Binary files /dev/null and b/code/lib/Bio/UniProt/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Wise/__init__.py b/code/lib/Bio/Wise/__init__.py
new file mode 100644
index 0000000..4841b08
--- /dev/null
+++ b/code/lib/Bio/Wise/__init__.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# Copyright 2004-2005 by Michael Hoffman. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Run and process output from the Wise2 package tools.
+
+Bio.Wise contains modules for running and processing the output of
+some of the models in the Wise2 package by Ewan Birney available from:
+ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
+http://www.ebi.ac.uk/Wise2/
+
+Bio.Wise.psw is for protein Smith-Waterman alignments
+Bio.Wise.dnal is for Smith-Waterman DNA alignments
+"""
+
+
+import os
+import sys
+import tempfile
+
+from Bio import SeqIO
+
+
+def _build_align_cmdline(
+    cmdline, pair, output_filename, kbyte=None, force_type=None, quiet=False
+):
+    """Build a command line string (PRIVATE).
+
+    >>> os.environ["WISE_KBYTE"]="300000"
+    >>> if os.isatty(sys.stderr.fileno()):
+    ...    c = _build_align_cmdline(["dnal"], ("seq1.fna", "seq2.fna"),
+    ...                             "/tmp/output", kbyte=100000)
+    ...    assert c == 'dnal -kbyte 100000 seq1.fna seq2.fna > /tmp/output', c
+    ...    c = _build_align_cmdline(["psw"], ("seq1.faa", "seq2.faa"),
+    ...                             "/tmp/output_aa")
+    ...    assert c == 'psw -kbyte 300000 seq1.faa seq2.faa > /tmp/output_aa', c
+    ... else:
+    ...    c = _build_align_cmdline(["dnal"], ("seq1.fna", "seq2.fna"),
+    ...                             "/tmp/output", kbyte=100000)
+    ...    assert c == 'dnal -kbyte 100000 -quiet seq1.fna seq2.fna > /tmp/output', c
+    ...    c = _build_align_cmdline(["psw"], ("seq1.faa", "seq2.faa"),
+    ...                             "/tmp/output_aa")
+    ...    assert c == 'psw -kbyte 300000 -quiet seq1.faa seq2.faa > /tmp/output_aa', c
+
+    """
+    cmdline = cmdline[:]
+
+    # XXX: force_type ignored
+
+    if kbyte is None:
+        try:
+            cmdline.extend(("-kbyte", os.environ["WISE_KBYTE"]))
+        except KeyError:
+            pass
+    else:
+        cmdline.extend(("-kbyte", str(kbyte)))
+
+    if not os.isatty(sys.stderr.fileno()):
+        cmdline.append("-quiet")
+
+    cmdline.extend(pair)
+    cmdline.extend((">", output_filename))
+    if quiet:
+        cmdline.extend(("2>", "/dev/null"))
+    return " ".join(cmdline)
+
+
+def align(
+    cmdline, pair, kbyte=None, force_type=None, dry_run=False, quiet=False, debug=False
+):
+    """Run an alignment. Returns a filehandle."""
+    if not pair or len(pair) != 2:
+        raise ValueError("Expected pair of filename, not %r" % pair)
+
+    output_file = tempfile.NamedTemporaryFile(mode="r")
+    input_files = (
+        tempfile.NamedTemporaryFile(mode="w"),
+        tempfile.NamedTemporaryFile(mode="w"),
+    )
+
+    if dry_run:
+        print(
+            _build_align_cmdline(
+                cmdline, pair, output_file.name, kbyte, force_type, quiet
+            )
+        )
+        return
+
+    for filename, input_file in zip(pair, input_files):
+        # Pipe the file through Biopython's Fasta parser/writer
+        # to make sure it conforms to the Fasta standard (in particular,
+        # Wise2 may choke on long lines in the Fasta file)
+        records = SeqIO.parse(open(filename), "fasta")
+        SeqIO.write(records, input_file, "fasta")
+        input_file.flush()
+
+    input_file_names = [input_file.name for input_file in input_files]
+
+    cmdline_str = _build_align_cmdline(
+        cmdline, input_file_names, output_file.name, kbyte, force_type, quiet
+    )
+
+    if debug:
+        sys.stderr.write("%s\n" % cmdline_str)
+
+    status = os.system(cmdline_str) >> 8
+
+    # After the shift, `status` is the process exit code; codes above 1
+    # are treated as errors
+    if status > 1:
+        if kbyte != 0:  # possible memory problem; could be None
+            sys.stderr.write("INFO trying again with the linear model\n")
+            return align(cmdline, pair, 0, force_type, dry_run, quiet, debug)
+        else:
+            raise OSError("%s returned %s" % (" ".join(cmdline), status))
+
+    return output_file
+
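+# A minimal sketch of the dry_run option; the FASTA file names are
+# placeholders. With dry_run=True the command line is printed, not executed.
+def _demo_dry_run():
+    """Show the dnal command line that align() would run."""
+    align(["dnal"], ("seq1.fna", "seq2.fna"), kbyte=100000, dry_run=True)
+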
+
+def all_pairs(singles):
+    """Generate pairs list for all-against-all alignments.
+
+    >>> all_pairs(range(4))
+    [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
+    """
+    pairs = []
+
+    singles = list(singles)
+    while singles:
+        suitor = singles.pop(0)  # if sorted, stay sorted
+        pairs.extend((suitor, single) for single in singles)
+
+    return pairs
+
+
+def main():
+    """Provision for command line testing."""
+    pass
+
+
+def _test(*args, **keywds):
+    import doctest
+
+    doctest.testmod(sys.modules[__name__], *args, **keywds)
+
+
+if __name__ == "__main__":
+    if __debug__:
+        _test()
+    main()
diff --git a/code/lib/Bio/Wise/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Wise/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..c3b8976
Binary files /dev/null and b/code/lib/Bio/Wise/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Wise/__pycache__/dnal.cpython-37.pyc b/code/lib/Bio/Wise/__pycache__/dnal.cpython-37.pyc
new file mode 100644
index 0000000..672c992
Binary files /dev/null and b/code/lib/Bio/Wise/__pycache__/dnal.cpython-37.pyc differ
diff --git a/code/lib/Bio/Wise/__pycache__/psw.cpython-37.pyc b/code/lib/Bio/Wise/__pycache__/psw.cpython-37.pyc
new file mode 100644
index 0000000..c4dca6f
Binary files /dev/null and b/code/lib/Bio/Wise/__pycache__/psw.cpython-37.pyc differ
diff --git a/code/lib/Bio/Wise/dnal.py b/code/lib/Bio/Wise/dnal.py
new file mode 100644
index 0000000..745f4f5
--- /dev/null
+++ b/code/lib/Bio/Wise/dnal.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# Copyright 2004-2005 by Michael Hoffman. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Run and process output from the Wise2 package tool dnal.
+
+Bio.Wise contains modules for running and processing the output of
+some of the models in the Wise2 package by Ewan Birney available from:
+ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
+http://www.ebi.ac.uk/Wise2/
+
+Bio.Wise.psw is for protein Smith-Waterman alignments
+Bio.Wise.dnal is for Smith-Waterman DNA alignments
+"""
+
+
+import re
+
+# Importing with leading underscore as not intended to be exposed
+from subprocess import getoutput as _getoutput
+
+from Bio import Wise
+
+
+_SCORE_MATCH = 4
+_SCORE_MISMATCH = -1
+_SCORE_GAP_START = -5
+_SCORE_GAP_EXTENSION = -1
+
+_CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]
+
+
+def _build_dnal_cmdline(match, mismatch, gap, extension):
+    res = _CMDLINE_DNAL[:]
+    res.extend(["-match", str(match)])
+    res.extend(["-mis", str(mismatch)])
+    res.extend(["-gap", str(-gap)])  # negative: convert score to penalty
+    res.extend(["-ext", str(-extension)])  # negative: convert score to penalty
+
+    return res
+
+
+_CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"
+
+
+def _fgrep_count(pattern, file):
+    return int(_getoutput(_CMDLINE_FGREP_COUNT % (pattern, file)))
+
+
+_re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")
+
+
+def _alb_line2coords(line):
+    return tuple(
+        int(coord) + 1  # zero-based -> one-based
+        for coord in _re_alb_line2coords.match(line).groups()
+    )
+
+
+def _get_coords(filename):
+    alb = open(filename)
+
+    start_line = None
+    end_line = None
+
+    for line in alb:
+        if line.startswith("["):
+            if not start_line:
+                start_line = line  # rstrip not needed
+            else:
+                end_line = line
+
+    if end_line is None:  # sequence is too short
+        return [(0, 0), (0, 0)]
+
+    return list(
+        zip(*map(_alb_line2coords, [start_line, end_line]))
+    )  # returns [(start0, end0), (start1, end1)]
+
+
+class Statistics:
+    """Calculate statistics from an ALB report."""
+
+    def __init__(self, filename, match, mismatch, gap, extension):
+        """Initialize the class."""
+        self.matches = _fgrep_count('"SEQUENCE" %s' % match, filename)
+        self.mismatches = _fgrep_count('"SEQUENCE" %s' % mismatch, filename)
+        self.gaps = _fgrep_count('"INSERT" %s' % gap, filename)
+
+        if gap == extension:
+            self.extensions = 0
+        else:
+            self.extensions = _fgrep_count('"INSERT" %s' % extension, filename)
+
+        self.score = (
+            match * self.matches
+            + mismatch * self.mismatches
+            + gap * self.gaps
+            + extension * self.extensions
+        )
+
+        if self.matches or self.mismatches or self.gaps or self.extensions:
+            self.coords = _get_coords(filename)
+        else:
+            self.coords = [(0, 0), (0, 0)]
+
+    def identity_fraction(self):
+        """Calculate the fraction of matches."""
+        return self.matches / (self.matches + self.mismatches)
+
+    header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"
+
+    def __str__(self):
+        """Statistics as a tab separated string."""
+        return "\t".join(
+            str(x)
+            for x in (
+                self.identity_fraction(),
+                self.matches,
+                self.mismatches,
+                self.gaps,
+                self.extensions,
+            )
+        )
+
+
+def align(
+    pair,
+    match=_SCORE_MATCH,
+    mismatch=_SCORE_MISMATCH,
+    gap=_SCORE_GAP_START,
+    extension=_SCORE_GAP_EXTENSION,
+    **keywds
+):
+    """Align a pair of DNA files using dnal and calculate the statistics of the alignment."""
+    cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
+    temp_file = Wise.align(cmdline, pair, **keywds)
+    try:
+        return Statistics(temp_file.name, match, mismatch, gap, extension)
+    except AttributeError:
+        # Wise.align() returns None for a dry run, so there is no file to parse
+        if keywds.get("dry_run"):
+            return None
+        raise
+
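+# A hedged end-to-end sketch; requires the Wise2 "dnal" binary plus fgrep on
+# the PATH, and the FASTA file names are placeholders.
+def _demo_dnal():
+    """Align two DNA FASTA files and print the alignment statistics."""
+    stats = align(("seq1.fna", "seq2.fna"))
+    print(Statistics.header)
+    print(stats)
+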
+
+def main():
+    """Command line implementation."""
+    import sys
+
+    stats = align(sys.argv[1:3])
+    print(
+        "\n".join(
+            "%s: %s" % (attr, getattr(stats, attr))
+            for attr in ("matches", "mismatches", "gaps", "extensions")
+        )
+    )
+    print("identity_fraction: %s" % stats.identity_fraction())
+    print("coords: %s" % stats.coords)
+
+
+def _test(*args, **keywds):
+    import doctest
+    import sys
+
+    doctest.testmod(sys.modules[__name__], *args, **keywds)
+
+
+if __name__ == "__main__":
+    if __debug__:
+        _test()
+    main()
diff --git a/code/lib/Bio/Wise/psw.py b/code/lib/Bio/Wise/psw.py
new file mode 100644
index 0000000..2eea52b
--- /dev/null
+++ b/code/lib/Bio/Wise/psw.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+# Copyright 2004 by Michael Hoffman. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Run and process output from the Wise2 package tool psw.
+
+Bio.Wise contains modules for running and processing the output of
+some of the models in the Wise2 package by Ewan Birney available from:
+ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
+http://www.ebi.ac.uk/Wise2/
+
+Bio.Wise.psw is for protein Smith-Waterman alignments
+Bio.Wise.dnal is for Smith-Waterman DNA alignments
+"""
+
+
+import os
+import re
+import sys
+
+from Bio import Wise
+
+
+_CMDLINE_PSW = ["psw", "-l", "-F"]
+_OPTION_GAP_START = "-g"
+_OPTION_GAP_EXTENSION = "-e"
+_OPTION_SCORES = "-m"
+
+
+class AlignmentColumnFullException(Exception):
+    """Manage exception in the alignment output."""
+
+    pass
+
+
+class Alignment(list):
+    """Define a container for all alignment Columns, output from running psw."""
+
+    def append(self, column_unit):
+        """Add an alignment Column to Alignment."""
+        try:
+            self[-1].append(column_unit)
+        except AlignmentColumnFullException:
+            list.append(self, AlignmentColumn(column_unit))
+        except IndexError:
+            list.append(self, AlignmentColumn(column_unit))
+
+
+class AlignmentColumn(list):
+    """Define a container for the units that made the Column."""
+
+    def _set_kind(self, column_unit):
+        if self.kind == "SEQUENCE":
+            self.kind = column_unit.kind
+
+    def __init__(self, column_unit):
+        """Initialize the class."""
+        assert column_unit.unit == 0
+        self.kind = column_unit.kind
+        list.__init__(self, [column_unit.column, None])
+
+    def __repr__(self):
+        """Represent the AlignmentColumn object as a string for debugging."""
+        return "%s(%r, %r)" % (self.kind, self[0], self[1])
+
+    def append(self, column_unit):
+        """Add a unit to the Column."""
+        if self[1] is not None:
+            raise AlignmentColumnFullException
+
+        assert column_unit.unit == 1
+
+        self._set_kind(column_unit)
+        self[1] = column_unit.column
+
+
+class ColumnUnit:
+    """Define a container for the details of each sequence alignment."""
+
+    def __init__(self, unit, column, kind):
+        """Initialize the class."""
+        self.unit = unit
+        self.column = column
+        self.kind = kind
+
+    def __repr__(self):
+        """Represent the ColumnUnit object as a string for debugging."""
+        return "ColumnUnit(unit=%r, column=%r, kind=%r)" % (
+            self.unit,
+            self.column,
+            self.kind,
+        )
+
+
+_re_unit = re.compile(r"^Unit +([01])- \[ *(-?\d+)- *(-?\d+)\] \[(\w+)\]$")
+
+
+def parse_line(line):
+    """Parse a line from psw.
+
+    >>> print(parse_line("Column 0:"))
+    None
+    >>> parse_line("Unit  0- [  -1-   0] [SEQUENCE]")
+    ColumnUnit(unit=0, column=0, kind='SEQUENCE')
+    >>> parse_line("Unit  1- [  85-  86] [SEQUENCE]")
+    ColumnUnit(unit=1, column=86, kind='SEQUENCE')
+    """
+    match = _re_unit.match(line.rstrip())
+
+    if not match:
+        return
+
+    return ColumnUnit(int(match.group(1)), int(match.group(3)), match.group(4))
+
+
+def parse(iterable):
+    """Parse a file.
+
+    format
+
+    Column 0:
+    Unit  0- [  -1-   0] [SEQUENCE]
+    Unit  1- [  85-  86] [SEQUENCE]
+
+    means that seq1[0] == seq2[86] (0-based)
+    """
+    alignment = Alignment()
+    for line in iterable:
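+        # Echo each raw line when the WISE_PY_DEBUG environment variable
+        # is set to a non-empty value (purely for debugging).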
+        try:
+            if os.environ["WISE_PY_DEBUG"]:
+                print(line)
+        except KeyError:
+            pass
+
+        column_unit = parse_line(line)
+        if column_unit:
+            alignment.append(column_unit)
+
+    return alignment
+
+
+def align(pair, scores=None, gap_start=None, gap_extension=None, *args, **keywds):
+    """Align a pair of DNA files using Wise2 psw."""
+    cmdline = _CMDLINE_PSW[:]
+    if scores:
+        cmdline.extend((_OPTION_SCORES, scores))
+    if gap_start:
+        cmdline.extend((_OPTION_GAP_START, str(gap_start)))
+    if gap_extension:
+        cmdline.extend((_OPTION_GAP_EXTENSION, str(gap_extension)))
+    temp_file = Wise.align(cmdline, pair, *args, **keywds)
+    return parse(temp_file)
+
+
+def main():
+    """Command line implementation."""
+    print(align(sys.argv[1:3]))
+
+
+def _test(*args, **keywds):
+    import doctest
+
+    doctest.testmod(sys.modules[__name__], *args, **keywds)
+
+
+if __name__ == "__main__":
+    if __debug__:
+        _test()
+        """Initialize the class."""
+    main()
diff --git a/code/lib/Bio/__init__.py b/code/lib/Bio/__init__.py
new file mode 100644
index 0000000..3eaea3b
--- /dev/null
+++ b/code/lib/Bio/__init__.py
@@ -0,0 +1,129 @@
+# Copyright 1999-2003 by Jeffrey Chang.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Collection of modules for dealing with biological data in Python.
+
+The Biopython Project is an international association of developers
+of freely available Python tools for computational molecular biology.
+
+https://biopython.org
+"""
+
+import os
+import warnings
+
+__version__ = "1.79"
+
+
+class MissingExternalDependencyError(Exception):
+    """Missing an external dependency.
+
+    Used for things like missing command line tools. Important for our unit
+    tests to allow skipping tests with missing external dependencies.
+    """
+
+
+class MissingPythonDependencyError(MissingExternalDependencyError, ImportError):
+    """Missing an external python dependency (subclass of ImportError).
+
+    Used for missing Python modules (rather than just a typical ImportError).
+    Important for our unit tests to allow skipping tests with missing external
+    python dependencies, while also allowing the exception to be caught as an
+    ImportError.
+    """
+
+
+class StreamModeError(ValueError):
+    """Incorrect stream mode (text vs binary).
+
+    This error should be raised when a stream (file or file-like object)
+    argument is in text mode while the receiving function expects binary mode,
+    or vice versa.
+    """
+
+
+class BiopythonWarning(Warning):
+    """Biopython warning.
+
+    Biopython should use this warning (or subclasses of it), making it easy to
+    silence all our warning messages should you wish to:
+
+    >>> import warnings
+    >>> from Bio import BiopythonWarning
+    >>> warnings.simplefilter('ignore', BiopythonWarning)
+
+    Consult the warnings module documentation for more details.
+    """
+
+
+class BiopythonParserWarning(BiopythonWarning):
+    """Biopython parser warning.
+
+    Some invalid data files cannot be parsed and will trigger an exception.
+    Where a reasonable interpretation is possible, Biopython will issue this
+    warning to indicate a potential problem. To silence these warnings, use:
+
+    >>> import warnings
+    >>> from Bio import BiopythonParserWarning
+    >>> warnings.simplefilter('ignore', BiopythonParserWarning)
+
+    Consult the warnings module documentation for more details.
+    """
+
+
+class BiopythonDeprecationWarning(BiopythonWarning):
+    """Biopython deprecation warning.
+
+    Biopython uses this warning instead of the built-in DeprecationWarning
+    because the latter has been ignored by default since Python 2.7.
+
+    To silence all our deprecation warning messages, use:
+
+    >>> import warnings
+    >>> from Bio import BiopythonDeprecationWarning
+    >>> warnings.simplefilter('ignore', BiopythonDeprecationWarning)
+
+    Code marked as deprecated is likely to be removed in a future version
+    of Biopython. To avoid removal of this code, please contact the Biopython
+    developers via the mailing list or GitHub.
+    """
+
+
+class BiopythonExperimentalWarning(BiopythonWarning):
+    """Biopython experimental code warning.
+
+    Biopython uses this warning for experimental code ('alpha' or 'beta'
+    level code) which is released as part of the standard releases to mark
+    sub-modules or functions for early adopters to test & give feedback.
+
+    Code issuing this warning is likely to change (or even be removed) in
+    a subsequent release of Biopython. Such code should NOT be used for
+    production/stable code. It should only be used if:
+
+    - You are running the latest release of Biopython, or ideally the
+      latest code from our repository.
+    - You are subscribed to the biopython-dev mailing list to provide
+      feedback on this code, and to be alerted of changes to it.
+
+    If all goes well, experimental code would be promoted to stable in
+    a subsequent release, and this warning removed from it.
+    """
+
+
+_parent_dir = os.path.dirname(os.path.dirname(__file__))
+if os.path.exists(os.path.join(_parent_dir, "setup.py")):
+    warnings.warn(
+        "You may be importing Biopython from inside the source tree."
+        " This is bad practice and might lead to downstream issues."
+        " In particular, you might encounter ImportErrors due to"
+        " missing compiled C extensions. We recommend that you"
+        " try running your code from outside the source tree."
+        " If you are outside the source tree then you have a"
+        " setup.py file in an unexpected directory: " + _parent_dir,
+        BiopythonWarning,
+    )
+# See #PR 2007 and issue #1991 for discussion on this warning:
+# https://github.com/biopython/biopython/pull/2007
diff --git a/code/lib/Bio/__pycache__/File.cpython-37.pyc b/code/lib/Bio/__pycache__/File.cpython-37.pyc
new file mode 100644
index 0000000..8b175f4
Binary files /dev/null and b/code/lib/Bio/__pycache__/File.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/LogisticRegression.cpython-37.pyc b/code/lib/Bio/__pycache__/LogisticRegression.cpython-37.pyc
new file mode 100644
index 0000000..32fa1c4
Binary files /dev/null and b/code/lib/Bio/__pycache__/LogisticRegression.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/MarkovModel.cpython-37.pyc b/code/lib/Bio/__pycache__/MarkovModel.cpython-37.pyc
new file mode 100644
index 0000000..9666934
Binary files /dev/null and b/code/lib/Bio/__pycache__/MarkovModel.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/MaxEntropy.cpython-37.pyc b/code/lib/Bio/__pycache__/MaxEntropy.cpython-37.pyc
new file mode 100644
index 0000000..fde5d1e
Binary files /dev/null and b/code/lib/Bio/__pycache__/MaxEntropy.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/NaiveBayes.cpython-37.pyc b/code/lib/Bio/__pycache__/NaiveBayes.cpython-37.pyc
new file mode 100644
index 0000000..9a0323b
Binary files /dev/null and b/code/lib/Bio/__pycache__/NaiveBayes.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/Seq.cpython-37.pyc b/code/lib/Bio/__pycache__/Seq.cpython-37.pyc
new file mode 100644
index 0000000..f5b6222
Binary files /dev/null and b/code/lib/Bio/__pycache__/Seq.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/SeqFeature.cpython-37.pyc b/code/lib/Bio/__pycache__/SeqFeature.cpython-37.pyc
new file mode 100644
index 0000000..a003b0b
Binary files /dev/null and b/code/lib/Bio/__pycache__/SeqFeature.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/SeqRecord.cpython-37.pyc b/code/lib/Bio/__pycache__/SeqRecord.cpython-37.pyc
new file mode 100644
index 0000000..a6ae8e3
Binary files /dev/null and b/code/lib/Bio/__pycache__/SeqRecord.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/__init__.cpython-311.pyc b/code/lib/Bio/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..47befb8
Binary files /dev/null and b/code/lib/Bio/__pycache__/__init__.cpython-311.pyc differ
diff --git a/code/lib/Bio/__pycache__/__init__.cpython-312.pyc b/code/lib/Bio/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..036b78d
Binary files /dev/null and b/code/lib/Bio/__pycache__/__init__.cpython-312.pyc differ
diff --git a/code/lib/Bio/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..375f197
Binary files /dev/null and b/code/lib/Bio/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/_utils.cpython-37.pyc b/code/lib/Bio/__pycache__/_utils.cpython-37.pyc
new file mode 100644
index 0000000..fc3fc9d
Binary files /dev/null and b/code/lib/Bio/__pycache__/_utils.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/bgzf.cpython-37.pyc b/code/lib/Bio/__pycache__/bgzf.cpython-37.pyc
new file mode 100644
index 0000000..b27abef
Binary files /dev/null and b/code/lib/Bio/__pycache__/bgzf.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/kNN.cpython-37.pyc b/code/lib/Bio/__pycache__/kNN.cpython-37.pyc
new file mode 100644
index 0000000..1727f0f
Binary files /dev/null and b/code/lib/Bio/__pycache__/kNN.cpython-37.pyc differ
diff --git a/code/lib/Bio/__pycache__/pairwise2.cpython-37.pyc b/code/lib/Bio/__pycache__/pairwise2.cpython-37.pyc
new file mode 100644
index 0000000..ce9eb30
Binary files /dev/null and b/code/lib/Bio/__pycache__/pairwise2.cpython-37.pyc differ
diff --git a/code/lib/Bio/_utils.py b/code/lib/Bio/_utils.py
new file mode 100644
index 0000000..872f4c1
--- /dev/null
+++ b/code/lib/Bio/_utils.py
@@ -0,0 +1,70 @@
+# Copyright 2010 by Eric Talevich. All rights reserved.
+# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Common utility functions for various Bio submodules."""
+
+
+import os
+
+
+def find_test_dir(start_dir=None):
+    """Find the absolute path of Biopython's Tests directory.
+
+    Arguments:
+    start_dir -- Initial directory to begin lookup (default to current dir)
+
+    If the directory is not found before reaching the filesystem's root
+    directory, a ValueError is raised.
+
+    """
+    if not start_dir:
+        # no callbacks in function signatures!
+        # defaults to the current directory
+        # (using __file__ would give the installed Biopython)
+        start_dir = "."
+
+    target = os.path.abspath(start_dir)
+    while True:
+        if os.path.isdir(os.path.join(target, "Bio")) and os.path.isdir(
+            os.path.join(target, "Tests")
+        ):
+            # Good, we're in the Biopython root now
+            return os.path.abspath(os.path.join(target, "Tests"))
+        # Recurse up the tree
+        # TODO - Test this on Windows
+        new, tmp = os.path.split(target)
+        if target == new:
+            # Reached root
+            break
+        target = new
+    raise ValueError(
+        "Not within Biopython source tree: %r" % os.path.abspath(start_dir)
+    )
+
+
+def run_doctest(target_dir=None, *args, **kwargs):
+    """Run doctest for the importing module."""
+    import doctest
+
+    # default doctest options
+    default_kwargs = {"optionflags": doctest.ELLIPSIS}
+    kwargs.update(default_kwargs)
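+    # Note that updating kwargs with the defaults last means the default
+    # optionflags override any caller-supplied value.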
+
+    cur_dir = os.path.abspath(os.curdir)
+
+    print("Running doctests...")
+    try:
+        os.chdir(find_test_dir(target_dir))
+        doctest.testmod(*args, **kwargs)
+    finally:
+        # and revert back to initial directory
+        os.chdir(cur_dir)
+    print("Done")
+
+
+if __name__ == "__main__":
+    run_doctest()
diff --git a/code/lib/Bio/bgzf.py b/code/lib/Bio/bgzf.py
new file mode 100644
index 0000000..614964a
--- /dev/null
+++ b/code/lib/Bio/bgzf.py
@@ -0,0 +1,920 @@
+#!/usr/bin/env python
+# Copyright 2010-2018 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+r"""Read and write BGZF compressed files (the GZIP variant used in BAM).
+
+The SAM/BAM file format (Sequence Alignment/Map) comes in a plain text
+format (SAM), and a compressed binary format (BAM). The latter uses a
+modified form of gzip compression called BGZF (Blocked GNU Zip Format),
+which can be applied to any file format to provide compression with
+efficient random access. BGZF is described together with the SAM/BAM
+file format at http://samtools.sourceforge.net/SAM1.pdf
+
+Please read the text below about 'virtual offsets' before using BGZF
+files for random access.
+
+Aim of this module
+------------------
+The Python gzip library can be used to read BGZF files, since for
+decompression they are just (specialised) gzip files. What this
+module aims to facilitate is random access to BGZF files (using the
+'virtual offset' idea), and writing BGZF files (which means using
+suitably sized gzip blocks and writing the extra 'BC' field in the
+gzip headers). As in the gzip library, the zlib library is used
+internally.
+
+In addition to being required for random access to and writing of
+BAM files, the BGZF format can also be used on other sequential
+data (in the sense of one record after another), such as most of
+the sequence data formats supported in Bio.SeqIO (like FASTA,
+FASTQ, GenBank, etc) or large MAF alignments.
+
+The Bio.SeqIO indexing functions use this module to support BGZF files.
+
+Technical Introduction to BGZF
+------------------------------
+The gzip file format allows multiple compressed blocks, each of which
+could be a stand alone gzip file. As an interesting bonus, this means
+you can use Unix ``cat`` to combine two or more gzip files into one by
+concatenating them. Also, each block can have one of several compression
+levels (including uncompressed, which actually takes up a little bit
+more space due to the gzip header).
+
+What the BAM designers realised was that while random access to data
+stored in traditional gzip files was slow, breaking the file into
+gzip blocks would allow fast random access to each block. To access
+a particular piece of the decompressed data, you just need to know
+which block it starts in (the offset of the gzip block start), and
+how far into the (decompressed) contents of the block you need to
+read.
+
+One problem with this is finding the gzip block sizes efficiently.
+You can do it with a standard gzip file, but it requires every block
+to be decompressed -- and that would be rather slow. Additionally
+typical gzip files may use very large blocks.
+
+All that differs in BGZF is that compressed size of each gzip block
+is limited to 2^16 bytes, and an extra 'BC' field in the gzip header
+records this size. Traditional decompression tools can ignore this,
+and unzip the file just like any other gzip file.
+
+The point of this is you can look at the first BGZF block, find out
+how big it is from this 'BC' header, and thus seek immediately to
+the second block, and so on.
+
+The BAM indexing scheme records read positions using a 64 bit
+'virtual offset', comprising ``coffset << 16 | uoffset``, where ``coffset``
+is the file offset of the BGZF block containing the start of the read
+(unsigned integer using up to 64-16 = 48 bits), and ``uoffset`` is the
+offset within the (decompressed) block (unsigned 16 bit integer).
+
+This limits you to BAM files where the last block starts within the
+first 2^48 bytes, or 256 petabytes, and the decompressed size of each
+block is at most 2^16 bytes, or 64kb. Note that this matches the BGZF
+'BC' field size which limits the compressed size of each block to
+2^16 bytes, allowing for BAM files to use BGZF with no gzip
+compression (useful for intermediate files in memory to reduce
+CPU load).
+
+Warning about namespaces
+------------------------
+It is considered a bad idea to use "from XXX import ``*``" in Python, because
+it pollutes the namespace. This is a real issue with Bio.bgzf (and the
+standard Python library gzip) because they contain a function called open.
+For example, suppose you do this:
+
+>>> from Bio.bgzf import *
+>>> print(open.__module__)
+Bio.bgzf
+
+Or,
+
+>>> from gzip import *
+>>> print(open.__module__)
+gzip
+
+Notice that the open function has been replaced. You can "fix" this if you
+need to by importing the built-in open function:
+
+>>> from builtins import open
+
+However, what we recommend instead is to use the explicit namespace, e.g.
+
+>>> from Bio import bgzf
+>>> print(bgzf.open.__module__)
+Bio.bgzf
+
+
+Examples
+--------
+This is an ordinary GenBank file compressed using BGZF, so it can
+be decompressed using gzip,
+
+>>> import gzip
+>>> handle = gzip.open("GenBank/NC_000932.gb.bgz", "r")
+>>> assert 0 == handle.tell()
+>>> line = handle.readline()
+>>> assert 80 == handle.tell()
+>>> line = handle.readline()
+>>> assert 143 == handle.tell()
+>>> data = handle.read(70000)
+>>> assert 70143 == handle.tell()
+>>> handle.close()
+
+We can also access the file using the BGZF reader - but pay
+attention to the file offsets which will be explained below:
+
+>>> handle = BgzfReader("GenBank/NC_000932.gb.bgz", "r")
+>>> assert 0 == handle.tell()
+>>> print(handle.readline().rstrip())
+LOCUS       NC_000932             154478 bp    DNA     circular PLN 15-APR-2009
+>>> assert 80 == handle.tell()
+>>> print(handle.readline().rstrip())
+DEFINITION  Arabidopsis thaliana chloroplast, complete genome.
+>>> assert 143 == handle.tell()
+>>> data = handle.read(70000)
+>>> assert 987828735 == handle.tell()
+>>> print(handle.readline().rstrip())
+f="GeneID:844718"
+>>> print(handle.readline().rstrip())
+     CDS             complement(join(84337..84771,85454..85843))
+>>> offset = handle.seek(make_virtual_offset(55074, 126))
+>>> print(handle.readline().rstrip())
+    68521 tatgtcattc gaaattgtat aaagacaact cctatttaat agagctattt gtgcaagtat
+>>> handle.close()
+
+Notice the handle's offset looks different as a BGZF file. This
+brings us to the key point about BGZF, which is the block structure:
+
+>>> handle = open("GenBank/NC_000932.gb.bgz", "rb")
+>>> for values in BgzfBlocks(handle):
+...     print("Raw start %i, raw length %i; data start %i, data length %i" % values)
+Raw start 0, raw length 15073; data start 0, data length 65536
+Raw start 15073, raw length 17857; data start 65536, data length 65536
+Raw start 32930, raw length 22144; data start 131072, data length 65536
+Raw start 55074, raw length 22230; data start 196608, data length 65536
+Raw start 77304, raw length 14939; data start 262144, data length 43478
+Raw start 92243, raw length 28; data start 305622, data length 0
+>>> handle.close()
+
+In this example the first four blocks are 'full' and each hold 65536
+bytes of uncompressed data. The fifth block isn't full and holds 43478
+bytes. Finally there is a special empty sixth block which takes 28 bytes
+on disk and serves as an 'end of file' (EOF) marker. If this is missing,
+it is possible your BGZF file is incomplete.
+
+By reading ahead 70,000 bytes we moved into the second BGZF block,
+and at that point the BGZF virtual offsets start to look different
+to a simple offset into the decompressed data as exposed by the gzip
+library.
+
+As an example, consider seeking to the decompressed position 196734.
+Since 196734 = 65536 + 65536 + 65536 + 126 = 65536*3 + 126, this
+is equivalent to jumping the first three blocks (which in this
+specific example are all size 65536 after decompression - which
+does not always hold) and starting at byte 126 of the fourth block
+(after decompression). For BGZF, we need to know the fourth block's
+offset of 55074 and the offset within the block of 126 to get the
+BGZF virtual offset.
+
+>>> print(55074 << 16 | 126)
+3609329790
+>>> print(bgzf.make_virtual_offset(55074, 126))
+3609329790
+
+Thus for this BGZF file, decompressed position 196734 corresponds
+to the virtual offset 3609329790. However, another BGZF file with
+different contents would have compressed more or less efficiently,
+so the compressed blocks would be different sizes. What this means
+is the mapping between the uncompressed offset and the compressed
+virtual offset depends on the BGZF file you are using.
+
+If you are accessing a BGZF file via this module, just use the
+handle.tell() method to note the virtual offset of a position you
+may later want to return to using handle.seek().
+
+The catch with BGZF virtual offsets is while they can be compared
+(which offset comes first in the file), you cannot safely subtract
+them to get the size of the data between them, nor add/subtract
+a relative offset.
+
+Of course you can parse this file with Bio.SeqIO using BgzfReader,
+although there isn't any benefit over using gzip.open(...), unless
+you want to index BGZF compressed sequence files:
+
+>>> from Bio import SeqIO
+>>> handle = BgzfReader("GenBank/NC_000932.gb.bgz")
+>>> record = SeqIO.read(handle, "genbank")
+>>> handle.close()
+>>> print(record.id)
+NC_000932.1
+
+Text Mode
+---------
+
+Like the standard library gzip.open(...), the BGZF code defaults to opening
+files in binary mode.
+
+You can request the file be opened in text mode, but beware that this is hard
+coded to the simple "latin1" (aka "iso-8859-1") encoding (which includes all
+the ASCII characters), which works well with most Western European languages.
+However, it is not fully compatible with the more widely used UTF-8 encoding.
+
+In variable width encodings like UTF-8, some single characters in the unicode
+text output are represented by multiple bytes in the raw binary form. This is
+problematic with BGZF, as we cannot always decode each block in isolation - a
+single unicode character could be split over two blocks. This can even happen
+with fixed width unicode encodings, as the BGZF block size is not fixed.
+
+Therefore, this module is currently restricted to only support single byte
+unicode encodings, such as ASCII, "latin1" (which is a superset of ASCII), or
+potentially other character maps (not implemented).
+
+Furthermore, unlike the default text mode on Python 3, we do not attempt to
+implement universal new line mode. This transforms the various operating system
+new line conventions like Windows (CR LF or "\r\n"), Unix (just LF, "\n"), or
+old Macs (just CR, "\r"), into just LF ("\n"). Here we have the same problem -
+is "\r" at the end of a block an incomplete Windows style new line?
+
+Instead, you will get the CR ("\r") and LF ("\n") characters as is.
+
+If your data is in UTF-8 or any other incompatible encoding, you must use
+binary mode, and decode the appropriate fragments yourself.
+"""
+
+import struct
+import sys
+import zlib
+
+from builtins import open as _open
+
+_bgzf_magic = b"\x1f\x8b\x08\x04"
+_bgzf_header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"
+_bgzf_eof = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+_bytes_BC = b"BC"
+
+
+def open(filename, mode="rb"):
+    r"""Open a BGZF file for reading, writing or appending.
+
+    If text mode is requested, in order to avoid multi-byte characters, this is
+    hard coded to use the "latin1" encoding, and "\r" and "\n" are passed as is
+    (without implementing universal new line mode).
+
+    If your data is in UTF-8 or any other incompatible encoding, you must use
+    binary mode, and decode the appropriate fragments yourself.
+    """
+    if "r" in mode.lower():
+        return BgzfReader(filename, mode)
+    elif "w" in mode.lower() or "a" in mode.lower():
+        return BgzfWriter(filename, mode)
+    else:
+        raise ValueError("Bad mode %r" % mode)
+
+
+def make_virtual_offset(block_start_offset, within_block_offset):
+    """Compute a BGZF virtual offset from block start and within block offsets.
+
+    The BAM indexing scheme records read positions using a 64 bit
+    'virtual offset', comprising in C terms:
+
+    block_start_offset << 16 | within_block_offset
+
+    Here block_start_offset is the file offset of the BGZF block
+    start (unsigned integer using up to 64-16 = 48 bits), and
+    within_block_offset within the (decompressed) block (unsigned
+    16 bit integer).
+
+    >>> make_virtual_offset(0, 0)
+    0
+    >>> make_virtual_offset(0, 1)
+    1
+    >>> make_virtual_offset(0, 2**16 - 1)
+    65535
+    >>> make_virtual_offset(0, 2**16)
+    Traceback (most recent call last):
+    ...
+    ValueError: Require 0 <= within_block_offset < 2**16, got 65536
+
+    >>> 65536 == make_virtual_offset(1, 0)
+    True
+    >>> 65537 == make_virtual_offset(1, 1)
+    True
+    >>> 131071 == make_virtual_offset(1, 2**16 - 1)
+    True
+
+    >>> 6553600000 == make_virtual_offset(100000, 0)
+    True
+    >>> 6553600001 == make_virtual_offset(100000, 1)
+    True
+    >>> 6553600010 == make_virtual_offset(100000, 10)
+    True
+
+    >>> make_virtual_offset(2**48, 0)
+    Traceback (most recent call last):
+    ...
+    ValueError: Require 0 <= block_start_offset < 2**48, got 281474976710656
+
+    """
+    if within_block_offset < 0 or within_block_offset >= 65536:
+        raise ValueError(
+            "Require 0 <= within_block_offset < 2**16, got %i" % within_block_offset
+        )
+    if block_start_offset < 0 or block_start_offset >= 281474976710656:
+        raise ValueError(
+            "Require 0 <= block_start_offset < 2**48, got %i" % block_start_offset
+        )
+    return (block_start_offset << 16) | within_block_offset
+
+
+def split_virtual_offset(virtual_offset):
+    """Divides a 64-bit BGZF virtual offset into block start & within block offsets.
+
+    >>> (100000, 0) == split_virtual_offset(6553600000)
+    True
+    >>> (100000, 10) == split_virtual_offset(6553600010)
+    True
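+
+    And a virtual offset round-trips through make_virtual_offset (this
+    follows directly from the two definitions):
+
+    >>> split_virtual_offset(make_virtual_offset(123456, 42)) == (123456, 42)
+    True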
+
+    """
+    start = virtual_offset >> 16
+    return start, virtual_offset ^ (start << 16)
+
+
+def BgzfBlocks(handle):
+    """Low level debugging function to inspect BGZF blocks.
+
+    Expects a BGZF compressed file opened in binary read mode using
+    the builtin open function. Do not use a handle from this bgzf
+    module or the gzip module's open function which will decompress
+    the file.
+
+    Returns the block start offset (see virtual offsets), the block
+    length (add these for the start of the next block), and the
+    decompressed length of the blocks contents (limited to 65536 in
+    BGZF), as an iterator - one tuple per BGZF block.
+
+    >>> from builtins import open
+    >>> handle = open("SamBam/ex1.bam", "rb")
+    >>> for values in BgzfBlocks(handle):
+    ...     print("Raw start %i, raw length %i; data start %i, data length %i" % values)
+    Raw start 0, raw length 18239; data start 0, data length 65536
+    Raw start 18239, raw length 18223; data start 65536, data length 65536
+    Raw start 36462, raw length 18017; data start 131072, data length 65536
+    Raw start 54479, raw length 17342; data start 196608, data length 65536
+    Raw start 71821, raw length 17715; data start 262144, data length 65536
+    Raw start 89536, raw length 17728; data start 327680, data length 65536
+    Raw start 107264, raw length 17292; data start 393216, data length 63398
+    Raw start 124556, raw length 28; data start 456614, data length 0
+    >>> handle.close()
+
+    Indirectly we can tell this file came from an old version of
+    samtools because all the blocks (except the final one and the
+    dummy empty EOF marker block) are 65536 bytes.  Later versions
+    avoid splitting a read between two blocks, and give the header
+    its own block (useful to speed up replacing the header). You
+    can see this in ex1_refresh.bam created using samtools 0.1.18:
+
+    samtools view -b ex1.bam > ex1_refresh.bam
+
+    >>> handle = open("SamBam/ex1_refresh.bam", "rb")
+    >>> for values in BgzfBlocks(handle):
+    ...     print("Raw start %i, raw length %i; data start %i, data length %i" % values)
+    Raw start 0, raw length 53; data start 0, data length 38
+    Raw start 53, raw length 18195; data start 38, data length 65434
+    Raw start 18248, raw length 18190; data start 65472, data length 65409
+    Raw start 36438, raw length 18004; data start 130881, data length 65483
+    Raw start 54442, raw length 17353; data start 196364, data length 65519
+    Raw start 71795, raw length 17708; data start 261883, data length 65411
+    Raw start 89503, raw length 17709; data start 327294, data length 65466
+    Raw start 107212, raw length 17390; data start 392760, data length 63854
+    Raw start 124602, raw length 28; data start 456614, data length 0
+    >>> handle.close()
+
+    The above example has no embedded SAM header (thus the first block
+    is very small at just 38 bytes of decompressed data), while the next
+    example does (a larger block of 103 bytes). Notice that the rest of
+    the blocks show the same sizes (they contain the same read data):
+
+    >>> handle = open("SamBam/ex1_header.bam", "rb")
+    >>> for values in BgzfBlocks(handle):
+    ...     print("Raw start %i, raw length %i; data start %i, data length %i" % values)
+    Raw start 0, raw length 104; data start 0, data length 103
+    Raw start 104, raw length 18195; data start 103, data length 65434
+    Raw start 18299, raw length 18190; data start 65537, data length 65409
+    Raw start 36489, raw length 18004; data start 130946, data length 65483
+    Raw start 54493, raw length 17353; data start 196429, data length 65519
+    Raw start 71846, raw length 17708; data start 261948, data length 65411
+    Raw start 89554, raw length 17709; data start 327359, data length 65466
+    Raw start 107263, raw length 17390; data start 392825, data length 63854
+    Raw start 124653, raw length 28; data start 456679, data length 0
+    >>> handle.close()
+
+    """
+    if isinstance(handle, BgzfReader):
+        raise TypeError("Function BgzfBlocks expects a binary handle")
+    data_start = 0
+    while True:
+        start_offset = handle.tell()
+        try:
+            block_length, data = _load_bgzf_block(handle)
+        except StopIteration:
+            break
+        data_len = len(data)
+        yield start_offset, block_length, data_start, data_len
+        data_start += data_len
+
+
+def _load_bgzf_block(handle, text_mode=False):
+    """Load the next BGZF block of compressed data (PRIVATE).
+
+    Returns a tuple (block size and data), or at end of file
+    will raise StopIteration.
+    """
+    magic = handle.read(4)
+    if not magic:
+        # End of file - should we signal this differently now?
+        # See https://www.python.org/dev/peps/pep-0479/
+        raise StopIteration
+    if magic != _bgzf_magic:
+        raise ValueError(
+            r"A BGZF (e.g. a BAM file) block should start with "
+            r"%r, not %r; handle.tell() now says %r"
+            % (_bgzf_magic, magic, handle.tell())
+        )
+    gzip_mod_time, gzip_extra_flags, gzip_os, extra_len = struct.unpack(
+        "<LBBH", handle.read(8)
+    )
+
+    block_size = None
+    x_len = 0
+    while x_len < extra_len:
+        subfield_id = handle.read(2)
+        subfield_len = struct.unpack("<H", handle.read(2))[0]  # uint16_t
+        subfield_data = handle.read(subfield_len)
+        x_len += subfield_len + 4
+        if subfield_id == _bytes_BC:
+            assert subfield_len == 2, "Wrong BC payload length"
+            assert block_size is None, "Two BC subfields?"
+            block_size = struct.unpack("<H", subfield_data)[0] + 1  # uint16_t
+    assert x_len == extra_len, (x_len, extra_len)
+    assert block_size is not None, "Missing BC, this isn't a BGZF file!"
+    # Now comes the compressed data, CRC, and length of the uncompressed data.
+    deflate_size = block_size - 1 - extra_len - 19
+    d = zlib.decompressobj(-15)  # Negative window size means no headers
+    data = d.decompress(handle.read(deflate_size)) + d.flush()
+    expected_crc = handle.read(4)
+    expected_size = struct.unpack("<I", handle.read(4))[0]
+    if expected_size != len(data):
+        raise RuntimeError("Decompressed to %i, not %i" % (len(data), expected_size))
+    # Should cope with a mix of Python platforms...
+    crc = zlib.crc32(data)
+    if crc < 0:
+        crc = struct.pack("<i", crc)
+    else:
+        crc = struct.pack("<I", crc)
+    if expected_crc != crc:
+        raise RuntimeError("CRC is %s, not %s" % (crc, expected_crc))
+    if text_mode:
+        # Note ISO-8859-1 aka latin-1 preserves the first 256 characters
+        # (i.e. ASCII) and, critically, is a single byte encoding.
+        return block_size, data.decode("latin-1")
+    else:
+        return block_size, data
+
+
+class BgzfReader:
+    r"""BGZF reader, acts like a read only handle but seek/tell differ.
+
+    Let's use the BgzfBlocks function to have a peek at the BGZF blocks
+    in an example BAM file,
+
+    >>> from builtins import open
+    >>> handle = open("SamBam/ex1.bam", "rb")
+    >>> for values in BgzfBlocks(handle):
+    ...     print("Raw start %i, raw length %i; data start %i, data length %i" % values)
+    Raw start 0, raw length 18239; data start 0, data length 65536
+    Raw start 18239, raw length 18223; data start 65536, data length 65536
+    Raw start 36462, raw length 18017; data start 131072, data length 65536
+    Raw start 54479, raw length 17342; data start 196608, data length 65536
+    Raw start 71821, raw length 17715; data start 262144, data length 65536
+    Raw start 89536, raw length 17728; data start 327680, data length 65536
+    Raw start 107264, raw length 17292; data start 393216, data length 63398
+    Raw start 124556, raw length 28; data start 456614, data length 0
+    >>> handle.close()
+
+    Now let's see how to use this block information to jump to
+    specific parts of the decompressed BAM file:
+
+    >>> handle = BgzfReader("SamBam/ex1.bam", "rb")
+    >>> assert 0 == handle.tell()
+    >>> magic = handle.read(4)
+    >>> assert 4 == handle.tell()
+
+    So far nothing so strange, we got the magic marker used at the
+    start of a decompressed BAM file, and the handle position makes
+    sense. Now however, let's jump to the end of this block and 4
+    bytes into the next block by reading 65536 bytes,
+
+    >>> data = handle.read(65536)
+    >>> len(data)
+    65536
+    >>> assert 1195311108 == handle.tell()
+
+    Expecting 4 + 65536 = 65540 were you? Well this is a BGZF 64-bit
+    virtual offset, which means:
+
+    >>> split_virtual_offset(1195311108)
+    (18239, 4)
+
+    You should spot 18239 as the start of the second BGZF block, while
+    the 4 is the offset into this block. See also make_virtual_offset,
+
+    >>> make_virtual_offset(18239, 4)
+    1195311108
+
+    Let's jump back to almost the start of the file,
+
+    >>> make_virtual_offset(0, 2)
+    2
+    >>> handle.seek(2)
+    2
+    >>> handle.close()
+
+    Note that you can use the max_cache argument to limit the number of
+    BGZF blocks cached in memory. The default is 100, and since each
+    block can be up to 64kb, the default cache could take up to 6MB of
+    RAM. The cache is not important for reading through the file in one
+    pass, but is important for improving performance of random access.
+    """
+
+    def __init__(self, filename=None, mode="r", fileobj=None, max_cache=100):
+        """Initialize the class."""
+        # TODO - Assuming we can seek, check for 28 bytes EOF empty block
+        # and if missing warn about possible truncation (as in samtools)?
+        if max_cache < 1:
+            raise ValueError("Use max_cache with a minimum of 1")
+        # Must open the BGZF file in binary mode, but we may want to
+        # treat the contents as either text or binary (unicode or
+        # bytes under Python 3)
+        if fileobj:
+            assert filename is None
+            handle = fileobj
+            assert "b" in handle.mode.lower()
+        else:
+            if "w" in mode.lower() or "a" in mode.lower():
+                raise ValueError(
+                    "Must use read mode (default), not write or append mode"
+                )
+            handle = _open(filename, "rb")
+        self._text = "b" not in mode.lower()
+        if self._text:
+            self._newline = "\n"
+        else:
+            self._newline = b"\n"
+        self._handle = handle
+        self.max_cache = max_cache
+        self._buffers = {}
+        self._block_start_offset = None
+        self._block_raw_length = None
+        self._load_block(handle.tell())
+
+    def _load_block(self, start_offset=None):
+        if start_offset is None:
+            # If the file is being read sequentially, then _handle.tell()
+            # should be pointing at the start of the next block.
+            # However, if seek has been used, we can't assume that.
+            start_offset = self._block_start_offset + self._block_raw_length
+        if start_offset == self._block_start_offset:
+            self._within_block_offset = 0
+            return
+        elif start_offset in self._buffers:
+            # Already in cache
+            self._buffer, self._block_raw_length = self._buffers[start_offset]
+            self._within_block_offset = 0
+            self._block_start_offset = start_offset
+            return
+        # Must hit the disk... first check cache limits,
+        while len(self._buffers) >= self.max_cache:
+            # TODO - Implement LRU cache removal?
+            self._buffers.popitem()
+        # Now load the block
+        handle = self._handle
+        if start_offset is not None:
+            handle.seek(start_offset)
+        self._block_start_offset = handle.tell()
+        try:
+            block_size, self._buffer = _load_bgzf_block(handle, self._text)
+        except StopIteration:
+            # EOF
+            block_size = 0
+            if self._text:
+                self._buffer = ""
+            else:
+                self._buffer = b""
+        self._within_block_offset = 0
+        self._block_raw_length = block_size
+        # Finally save the block in our cache,
+        self._buffers[self._block_start_offset] = self._buffer, block_size
+
+    def tell(self):
+        """Return a 64-bit unsigned BGZF virtual offset."""
+        if 0 < self._within_block_offset and self._within_block_offset == len(
+            self._buffer
+        ):
+            # Special case where we're right at the end of a (non empty) block.
+            # For non-maximal blocks could give two possible virtual offsets,
+            # but for a maximal block can't use 65536 as the within block
+            # offset. Therefore for consistency, use the next block and a
+            # within block offset of zero.
+            return (self._block_start_offset + self._block_raw_length) << 16
+        else:
+            # return make_virtual_offset(self._block_start_offset,
+            #                           self._within_block_offset)
+            # TODO - Include bounds checking as in make_virtual_offset?
+            return (self._block_start_offset << 16) | self._within_block_offset
+
+    def seek(self, virtual_offset):
+        """Seek to a 64-bit unsigned BGZF virtual offset."""
+        # Do this inline to avoid a function call,
+        # start_offset, within_block = split_virtual_offset(virtual_offset)
+        start_offset = virtual_offset >> 16
+        within_block = virtual_offset ^ (start_offset << 16)
+        if start_offset != self._block_start_offset:
+            # Don't need to load the block if already there
+            # (this avoids a function call since _load_block would do nothing)
+            self._load_block(start_offset)
+            assert start_offset == self._block_start_offset
+        if within_block > len(self._buffer):
+            if not (within_block == 0 and len(self._buffer) == 0):
+                raise ValueError(
+                    "Within offset %i but block size only %i"
+                    % (within_block, len(self._buffer))
+                )
+        self._within_block_offset = within_block
+        # assert virtual_offset == self.tell(), \
+        #    "Did seek to %i (%i, %i), but tell says %i (%i, %i)" \
+        #    % (virtual_offset, start_offset, within_block,
+        #       self.tell(), self._block_start_offset,
+        #       self._within_block_offset)
+        return virtual_offset
+
+    def read(self, size=-1):
+        """Read method for the BGZF module."""
+        if size < 0:
+            raise NotImplementedError("Don't be greedy, that could be massive!")
+
+        result = "" if self._text else b""
+        while size and self._buffer:
+            if self._within_block_offset + size <= len(self._buffer):
+                # This may leave us right at the end of a block
+                # (lazy loading, don't load the next block unless we have to)
+                data = self._buffer[
+                    self._within_block_offset : self._within_block_offset + size
+                ]
+                self._within_block_offset += size
+                assert data  # Must be at least 1 byte
+                result += data
+                break
+            else:
+                data = self._buffer[self._within_block_offset :]
+                size -= len(data)
+                self._load_block()  # will reset offsets
+                # TODO - Test with corner case of an empty block followed by
+                # a non-empty block
+                result += data
+
+        return result
+
+    def readline(self):
+        """Read a single line for the BGZF file."""
+        result = "" if self._text else b""
+        while self._buffer:
+            i = self._buffer.find(self._newline, self._within_block_offset)
+            # Three cases to consider,
+            if i == -1:
+                # No newline, need to read in more data
+                data = self._buffer[self._within_block_offset :]
+                self._load_block()  # will reset offsets
+                result += data
+            elif i + 1 == len(self._buffer):
+                # Found new line, but right at end of block (SPECIAL)
+                data = self._buffer[self._within_block_offset :]
+                # Must now load the next block to ensure tell() works
+                self._load_block()  # will reset offsets
+                assert data
+                result += data
+                break
+            else:
+                # Found new line, not at end of block (easy case, no IO)
+                data = self._buffer[self._within_block_offset : i + 1]
+                self._within_block_offset = i + 1
+                # assert data.endswith(self._newline)
+                result += data
+                break
+
+        return result
+
+    def __next__(self):
+        """Return the next line."""
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+
+    def __iter__(self):
+        """Iterate over the lines in the BGZF file."""
+        return self
+
+    def close(self):
+        """Close BGZF file."""
+        self._handle.close()
+        self._buffer = None
+        self._block_start_offset = None
+        self._buffers = None
+
+    def seekable(self):
+        """Return True indicating the BGZF supports random access."""
+        return True
+
+    def isatty(self):
+        """Return True if connected to a TTY device."""
+        return False
+
+    def fileno(self):
+        """Return integer file descriptor."""
+        return self._handle.fileno()
+
+    def __enter__(self):
+        """Open a file operable with WITH statement."""
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """Close a file with WITH statement."""
+        self.close()
+
+
+class BgzfWriter:
+    """Define a BGZFWriter object."""
+
+    def __init__(self, filename=None, mode="w", fileobj=None, compresslevel=6):
+        """Initilize the class."""
+        if fileobj:
+            assert filename is None
+            handle = fileobj
+        else:
+            if "w" not in mode.lower() and "a" not in mode.lower():
+                raise ValueError("Must use write or append mode, not %r" % mode)
+            if "a" in mode.lower():
+                raise NotImplementedError("Append mode is not implemented yet")
+                # handle = _open(filename, "ab")
+            else:
+                handle = _open(filename, "wb")
+        self._text = "b" not in mode.lower()
+        self._handle = handle
+        self._buffer = b""
+        self.compresslevel = compresslevel
+
+    def _write_block(self, block):
+        """Write provided data to file as a single BGZF compressed block (PRIVATE)."""
+        # print("Saving %i bytes" % len(block))
+        start_offset = self._handle.tell()
+        assert len(block) <= 65536
+        # Giving a negative window bits means no gzip/zlib headers,
+        # -15 used in samtools
+        c = zlib.compressobj(
+            self.compresslevel, zlib.DEFLATED, -15, zlib.DEF_MEM_LEVEL, 0
+        )
+        compressed = c.compress(block) + c.flush()
+        del c
+        if len(compressed) > 65536:
+            raise RuntimeError(
+                "TODO - Didn't compress enough, try less data in this block"
+            )
+        crc = zlib.crc32(block)
+        # Should cope with a mix of Python platforms...
+        if crc < 0:
+            crc = struct.pack("= 65536:
+                self._write_block(self._buffer[:65536])
+                self._buffer = self._buffer[65536:]
+
+    def flush(self):
+        """Flush data explicitally."""
+        while len(self._buffer) >= 65536:
+            self._write_block(self._buffer[:65535])
+            self._buffer = self._buffer[65535:]
+        self._write_block(self._buffer)
+        self._buffer = b""
+        self._handle.flush()
+
+    def close(self):
+        """Flush data, write 28 bytes BGZF EOF marker, and close BGZF file.
+
+        samtools will look for a magic EOF marker, just a 28 byte empty BGZF
+        block, and if it is missing warns the BAM file may be truncated. In
+        addition to samtools writing this block, so too does bgzip - so this
+        implementation does too.
+        """
+        if self._buffer:
+            self.flush()
+        self._handle.write(_bgzf_eof)
+        self._handle.flush()
+        self._handle.close()
+
+    def tell(self):
+        """Return a BGZF 64-bit virtual offset."""
+        return make_virtual_offset(self._handle.tell(), len(self._buffer))
+
+    def seekable(self):
+        """Return True indicating the BGZF supports random access."""
+        # Not seekable, but we do support tell...
+        return False
+
+    def isatty(self):
+        """Return True if connected to a TTY device."""
+        return False
+
+    def fileno(self):
+        """Return integer file descriptor."""
+        return self._handle.fileno()
+
+    def __enter__(self):
+        """Open a file operable with WITH statement."""
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """Close a file with WITH statement."""
+        self.close()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        print("Call this with no arguments and pipe uncompressed data in on stdin")
+        print("and it will produce BGZF compressed data on stdout. e.g.")
+        print("")
+        print("./bgzf.py < example.fastq > example.fastq.bgz")
+        print("")
+        print("The extension convention of *.bgz is to distinugish these from *.gz")
+        print("used for standard gzipped files without the block structure of BGZF.")
+        print("You can use the standard gunzip command to decompress BGZF files,")
+        print("if it complains about the extension try something like this:")
+        print("")
+        print("cat example.fastq.bgz | gunzip > example.fastq")
+        print("")
+        print("See also the tool bgzip that comes with samtools")
+        sys.exit(0)
+
+    # Ensure we have binary mode handles
+    # (leave stderr as default text mode)
+    stdin = sys.stdin.buffer
+    stdout = sys.stdout.buffer
+
+    sys.stderr.write("Producing BGZF output from stdin...\n")
+    w = BgzfWriter(fileobj=stdout)
+    while True:
+        data = stdin.read(65536)
+        w.write(data)
+        if not data:
+            break
+    # Doing close will write an empty BGZF block as EOF marker:
+    w.close()
+    sys.stderr.write("BGZF data produced\n")
diff --git a/code/lib/Bio/codonalign/__init__.py b/code/lib/Bio/codonalign/__init__.py
new file mode 100644
index 0000000..7ed1700
--- /dev/null
+++ b/code/lib/Bio/codonalign/__init__.py
@@ -0,0 +1,810 @@
+# Copyright 2013 by Zheng Ruan (zruan1991@gmail.com).
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code for dealing with Codon Alignments."""
+
+import copy
+from collections.abc import Mapping, Iterable
+
+from Bio import BiopythonWarning
+from Bio import BiopythonExperimentalWarning
+
+from Bio.SeqRecord import SeqRecord
+from Bio.Data import CodonTable
+
+from Bio.codonalign.codonseq import CodonSeq
+from Bio.codonalign.codonalignment import CodonAlignment, mktest
+
+import warnings
+
+warnings.warn(
+    "Bio.codonalign is an experimental module which may undergo "
+    "significant changes prior to its future official release.",
+    BiopythonExperimentalWarning,
+)
+
+
+def build(
+    pro_align,
+    nucl_seqs,
+    corr_dict=None,
+    gap_char="-",
+    unknown="X",
+    codon_table=None,
+    complete_protein=False,
+    anchor_len=10,
+    max_score=10,
+):
+    """Build a codon alignment from protein alignment and corresponding nucleotides.
+
+    Arguments:
+     - pro_align  - a protein MultipleSeqAlignment object
+     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
+       or a collection of SeqRecord.
+     - corr_dict  - a dict that maps protein id to nucleotide id
+     - complete_protein - whether the sequence begins with a start
+       codon
+
+    Return a CodonAlignment object.
+
+    The example below answers this Biostars question: https://www.biostars.org/p/89741/
+
+    >>> from Bio.Seq import Seq
+    >>> from Bio.SeqRecord import SeqRecord
+    >>> from Bio.Align import MultipleSeqAlignment
+    >>> from Bio.codonalign import build
+    >>> seq1 = SeqRecord(Seq('ATGTCTCGT'), id='pro1')
+    >>> seq2 = SeqRecord(Seq('ATGCGT'), id='pro2')
+    >>> pro1 = SeqRecord(Seq('MSR'), id='pro1')
+    >>> pro2 = SeqRecord(Seq('M-R'), id='pro2')
+    >>> aln = MultipleSeqAlignment([pro1, pro2])
+    >>> codon_aln = build(aln, [seq1, seq2])
+    >>> print(codon_aln)
+    CodonAlignment with 2 rows and 9 columns (3 codons)
+    ATGTCTCGT pro1
+    ATG---CGT pro2
+
+    """
+    # TODO
+    # add an option to allow the user to specify the returned object?
+
+    from Bio.Align import MultipleSeqAlignment
+
+    # check the type of object of pro_align
+    if not isinstance(pro_align, MultipleSeqAlignment):
+        raise TypeError("the first argument should be a MultipleSeqAlignment object")
+    # check whether the number of seqs in pro_align and nucl_seqs is
+    # the same
+    pro_num = len(pro_align)
+    if corr_dict is None:
+        try:
+            nucl_num = len(nucl_seqs)
+        except TypeError:
+            # nucl_seqs will be an iterator if returned by SeqIO.parse()
+            nucl_seqs = tuple(nucl_seqs)
+            nucl_num = len(nucl_seqs)
+        if pro_num > nucl_num:
+            raise ValueError(
+                f"Higher Number of SeqRecords in Protein Alignment ({pro_num}) "
+                f"than the Number of Nucleotide SeqRecords ({nucl_num}) are found!"
+            )
+
+        # Determine the protein sequences and nucl sequences
+        # correspondence. If nucl_seqs is a list, tuple or read by
+        # SeqIO.parse(), we assume the order of sequences in pro_align
+        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
+        # SeqIO.index(), we match seqs in pro_align and those in
+        # nucl_seq by their id.
+        if isinstance(nucl_seqs, Mapping):
+            corr_method = 1
+        elif isinstance(nucl_seqs, Iterable):
+            corr_method = 0
+        else:
+            raise TypeError(
+                "Nucl Sequences Error, Unknown type to assign correspondence method"
+            )
+    else:
+        if not isinstance(corr_dict, dict):
+            raise TypeError(
+                "corr_dict should be a dict that corresponds "
+                "protein id to nucleotide id!"
+            )
+        if len(corr_dict) >= pro_num:
+            if isinstance(nucl_seqs, Mapping):
+                pass
+            else:
+                d = {}
+                for record in nucl_seqs:
+                    key = record.id
+                    if key in d:
+                        raise ValueError("Duplicate key '%s'" % key)
+                    d[key] = record
+                nucl_seqs = d
+            corr_method = 2
+        else:
+            raise RuntimeError(
+                f"Number of items in corr_dict ({len(corr_dict)}) "
+                f"is less than number of protein records ({pro_num})"
+            )
+
+    # set up pro-nucl correspondence based on corr_method
+    # corr_method = 0, consecutive pairing
+    if corr_method == 0:
+        pro_nucl_pair = zip(pro_align, nucl_seqs)
+    # corr_method = 1, keyword pairing
+    elif corr_method == 1:
+        nucl_id = set(nucl_seqs.keys())
+        pro_id = {i.id for i in pro_align}
+        # check if there is pro_id that does not have a nucleotide match
+        if pro_id - nucl_id:
+            diff = pro_id - nucl_id
+            raise ValueError(
+                f"Protein Record {', '.join(diff)} cannot find a "
+                "nucleotide sequence match, please check the id"
+            )
+        else:
+            pro_nucl_pair = []
+            for pro_rec in pro_align:
+                pro_nucl_pair.append((pro_rec, nucl_seqs[pro_rec.id]))
+    # corr_method = 2, dict pairing
+    elif corr_method == 2:
+        pro_nucl_pair = []
+        for pro_rec in pro_align:
+            try:
+                nucl_id = corr_dict[pro_rec.id]
+            except KeyError:
+                # Raise rather than print-and-exit: library code should not
+                # terminate the interpreter.
+                raise ValueError(
+                    "Protein record (%s) is not in corr_dict!" % pro_rec.id
+                ) from None
+            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))
+
+    if codon_table is None:
+        codon_table = CodonTable.generic_by_id[1]
+
+    codon_aln = []
+    shift = False
+    for pair in pro_nucl_pair:
+        # Beware that the following span corresponds to an ungapped
+        # nucleotide sequence.
+        corr_span = _check_corr(
+            pair[0],
+            pair[1],
+            gap_char=gap_char,
+            codon_table=codon_table,
+            complete_protein=complete_protein,
+            anchor_len=anchor_len,
+        )
+        if not corr_span:
+            raise ValueError(
+                f"Protein Record {pair[0].id} and "
+                f"Nucleotide Record {pair[1].id} do not match!"
+            )
+        else:
+            codon_rec = _get_codon_rec(
+                pair[0],
+                pair[1],
+                corr_span,
+                gap_char=gap_char,
+                complete_protein=complete_protein,
+                codon_table=codon_table,
+                max_score=max_score,
+            )
+            codon_aln.append(codon_rec)
+            if corr_span[1] == 2:
+                shift = True
+    if shift:
+        return CodonAlignment(_align_shift_recs(codon_aln))
+    else:
+        return CodonAlignment(codon_aln)
+
+
+def _codons2re(codons):
+    """Generate regular expression based on a given list of codons (PRIVATE)."""
+    reg = ""
+    for i in zip(*codons):
+        if len(set(i)) == 1:
+            reg += "".join(set(i))
+        else:
+            reg += "[" + "".join(set(i)) + "]"
+    return reg
+
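+# Illustrative note (not a doctest, since the order inside the brackets
+# depends on Python set iteration): the leucine codons ["TTA", "TTG"]
+# collapse to the pattern 'TT[AG]' (or equivalently 'TT[GA]').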
+
+def _get_aa_regex(codon_table, stop="*", unknown="X"):
+    """Set up the regular expression of a given CodonTable (PRIVATE).
+
+    >>> from Bio.Data.CodonTable import generic_by_id
+    >>> p = generic_by_id[1]
+    >>> t = _get_aa_regex(p)
+    >>> print(t['A'][0])
+    G
+    >>> print(t['A'][1])
+    C
+    >>> print(sorted(list(t['A'][2:])))
+    ['A', 'C', 'G', 'T', 'U', '[', ']']
+    >>> print(sorted(list(t['L'][:5])))
+    ['C', 'T', 'U', '[', ']']
+    >>> print(sorted(list(t['L'][5:9])))
+    ['T', 'U', '[', ']']
+    >>> print(sorted(list(t['L'][9:])))
+    ['A', 'C', 'G', 'T', 'U', '[', ']']
+
+    """
+    from Bio.Data.CodonTable import CodonTable
+
+    if not isinstance(codon_table, CodonTable):
+        raise TypeError("Input table is not a instance of Bio.Data.CodonTable object")
+    aa2codon = {}
+    for codon, aa in codon_table.forward_table.items():
+        aa2codon.setdefault(aa, []).append(codon)
+    for aa, codons in aa2codon.items():
+        aa2codon[aa] = _codons2re(codons)
+    aa2codon[stop] = _codons2re(codon_table.stop_codons)
+    aa2codon[unknown] = "..."
+    return aa2codon
+
+
+def _check_corr(
+    pro, nucl, gap_char, codon_table, complete_protein=False, anchor_len=10,
+):
+    """Check if the nucleotide can be translated into the protein (PRIVATE).
+
+    Expects two SeqRecord objects.
+    """
+    import re
+
+    if not isinstance(pro, SeqRecord) or not isinstance(nucl, SeqRecord):
+        raise TypeError(
+            "_check_corr accepts two SeqRecord object. Please check your input."
+        )
+
+    aa2re = _get_aa_regex(codon_table)
+    pro_re = ""
+    for aa in pro.seq:
+        if aa != gap_char:
+            pro_re += aa2re[aa]
+
+    nucl_seq = str(nucl.seq.upper().replace(gap_char, ""))
+    match = re.search(pro_re, nucl_seq)
+    if match:
+        # mode = 0, direct match
+        return (match.span(), 0)
+    else:
+        # The failure might be caused by mismatches or frameshifts;
+        # try again using anchors
+        # anchor_len = 10  # adjust this value to tune performance
+        pro_seq = str(pro.seq).replace(gap_char, "")
+        anchors = [
+            pro_seq[i : (i + anchor_len)] for i in range(0, len(pro_seq), anchor_len)
+        ]
+        # if the last anchor is less than the specified anchor
+        # size, we combine the penultimate and the last anchor
+        # together as the last one.
+        # TODO: modify this to deal with short sequence with only
+        # one anchor.
+        if len(anchors[-1]) < anchor_len:
+            anchors[-1] = anchors[-2] + anchors[-1]
+
+        pro_re = []
+        anchor_distance = 0
+        anchor_pos = []
+        for i, anchor in enumerate(anchors):
+            this_anchor_len = len(anchor)
+            qcodon = ""
+            fncodon = ""
+            # dirty code to deal with the last anchor
+            # as the last anchor is combined in the steps
+            # above, we need to get the true last anchor to
+            # pro_re
+            if this_anchor_len == anchor_len:
+                for aa in anchor:
+                    if complete_protein and i == 0:
+                        qcodon += _codons2re(codon_table.start_codons)
+                        fncodon += aa2re["X"]
+                        continue
+                    qcodon += aa2re[aa]
+                    fncodon += aa2re["X"]
+                match = re.search(qcodon, nucl_seq)
+            elif this_anchor_len > anchor_len:
+                last_qcodon = ""
+                last_fcodon = ""
+                for j in range(anchor_len, len(anchor)):
+                    last_qcodon += aa2re[anchor[j]]
+                    last_fcodon += aa2re["X"]
+                match = re.search(last_qcodon, nucl_seq)
+            # build full_pro_re from anchors
+            if match:
+                anchor_pos.append((match.start(), match.end(), i))
+                if this_anchor_len == anchor_len:
+                    pro_re.append(qcodon)
+                else:
+                    pro_re.append(last_qcodon)
+            else:
+                if this_anchor_len == anchor_len:
+                    pro_re.append(fncodon)
+                else:
+                    pro_re.append(last_fcodon)
+        full_pro_re = "".join(pro_re)
+        match = re.search(full_pro_re, nucl_seq)
+        if match:
+            # mode = 1, mismatch
+            return (match.span(), 1)
+        else:
+            # check frames of anchors
+            # ten frameshift events are allowed in a sequence
+            first_anchor = True
+            shift_id_pos = 0
+            # check the first anchor
+            if first_anchor and anchor_pos[0][2] != 0:
+                shift_val_lst = [1, 2, 3 * anchor_len - 2, 3 * anchor_len - 1, 0]
+                sh_anc = anchors[0]
+                for shift_val in shift_val_lst:
+                    if shift_val == 0:
+                        qcodon = None
+                        break
+                    if shift_val in (1, 2):
+                        sh_nuc_len = anchor_len * 3 + shift_val
+                    elif shift_val in (3 * anchor_len - 2, 3 * anchor_len - 1):
+                        sh_nuc_len = anchor_len * 3 - (3 * anchor_len - shift_val)
+                    if anchor_pos[0][0] >= sh_nuc_len:
+                        sh_nuc = nucl_seq[
+                            anchor_pos[0][0] - sh_nuc_len : anchor_pos[0][0]
+                        ]
+                    else:
+                        # this is unlikely to produce the correct output
+                        sh_nuc = nucl_seq[: anchor_pos[0][0]]
+                    qcodon, shift_id_pos = _get_shift_anchor_re(
+                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
+                    )
+                    if qcodon is not None and qcodon != -1:
+                        # pro_re[0] should be '.'*anchor_len, therefore I
+                        # replace it.
+                        pro_re[0] = qcodon
+                        break
+                if qcodon == -1:
+                    warnings.warn(
+                        f"first frameshift detection failed for {nucl.id}",
+                        BiopythonWarning,
+                    )
+            # check anchors in the middle
+            for i in range(len(anchor_pos) - 1):
+                shift_val = (anchor_pos[i + 1][0] - anchor_pos[i][0]) % (3 * anchor_len)
+                sh_anc = "".join(anchors[anchor_pos[i][2] : anchor_pos[i + 1][2]])
+                sh_nuc = nucl_seq[anchor_pos[i][0] : anchor_pos[i + 1][0]]
+                qcodon = None
+                if shift_val != 0:
+                    qcodon, shift_id_pos = _get_shift_anchor_re(
+                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
+                    )
+                if qcodon is not None and qcodon != -1:
+                    pro_re[anchor_pos[i][2] : anchor_pos[i + 1][2]] = [qcodon]
+                    qcodon = None
+                elif qcodon == -1:
+                    warnings.warn(
+                        f"middle frameshift detection failed for {nucl.id}",
+                        BiopythonWarning,
+                    )
+            # check the last anchor
+            if anchor_pos[-1][2] + 1 == len(anchors) - 1:
+                sh_anc = anchors[-1]
+                this_anchor_len = len(sh_anc)
+                shift_val_lst = [
+                    1,
+                    2,
+                    3 * this_anchor_len - 2,
+                    3 * this_anchor_len - 1,
+                    0,
+                ]
+                for shift_val in shift_val_lst:
+                    if shift_val == 0:
+                        qcodon = None
+                        break
+                    if shift_val in (1, 2):
+                        sh_nuc_len = this_anchor_len * 3 + shift_val
+                    elif shift_val in (
+                        3 * this_anchor_len - 2,
+                        3 * this_anchor_len - 1,
+                    ):
+                        sh_nuc_len = this_anchor_len * 3 - (
+                            3 * this_anchor_len - shift_val
+                        )
+                    if len(nucl_seq) - anchor_pos[-1][0] >= sh_nuc_len:
+                        sh_nuc = nucl_seq[
+                            anchor_pos[-1][0] : anchor_pos[-1][0] + sh_nuc_len
+                        ]
+                    else:
+                        # this is unlikely to produce the correct output
+                        sh_nuc = nucl_seq[anchor_pos[-1][0] :]
+                    qcodon, shift_id_pos = _get_shift_anchor_re(
+                        sh_anc, sh_nuc, shift_val, aa2re, this_anchor_len, shift_id_pos
+                    )
+                    if qcodon is not None and qcodon != -1:
+                        pro_re.pop()
+                        pro_re[-1] = qcodon
+                        break
+                if qcodon == -1:
+                    warnings.warn(
+                        f"last frameshift detection failed for {nucl.id}",
+                        BiopythonWarning,
+                    )
+            # try global match
+            full_pro_re = "".join(pro_re)
+            match = re.search(full_pro_re, nucl_seq)
+            if match:
+                return (match.span(), 2, match)
+            else:
+                raise RuntimeError(
+                    f"Protein SeqRecord ({pro.id}) and "
+                    f"Nucleotide SeqRecord ({nucl.id}) do not match!"
+                )
+
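+# _check_corr returns (span, 0) for a direct regex match, (span, 1) when
+# the anchor-based match succeeds despite mismatches, and (span, 2, match)
+# when frameshifts had to be modelled; _get_codon_rec dispatches on this
+# mode flag.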
+
+def _get_shift_anchor_re(sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos):
+    """Find a regular expression matching a potentially shifted anchor (PRIVATE).
+
+    Arguments:
+     - sh_anc    - shifted anchor sequence
+     - sh_nuc    - potentially corresponding nucleotide sequence
+       of sh_anc
+     - shift_val - 1 or 2 indicates forward frame shift, whereas
+       3*anchor_len-1 or 3*anchor_len-2 indicates
+       backward shift
+     - aa2re     - aa to codon re dict
+     - anchor_len - length of the anchor
+     - shift_id_pos - specify current shift name we are at
+
+    """
+    import re
+
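+    # named-group ids 'a'..'j' (chr(97)..chr(106)); this caps the number of
+    # tracked frameshift events at ten per sequence, as noted in _check_corr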
+    shift_id = [chr(i) for i in range(97, 107)]
+    if 0 < shift_val < 3 * anchor_len - 2:
+        # if shift_val in (1, 2):
+        for j in range(len(sh_anc)):
+            qcodon = "^"
+            for k, aa in enumerate(sh_anc):
+                if k == j:
+                    qcodon += aa2re[aa] + "(?P<" + shift_id[shift_id_pos] + ">..*)"
+                else:
+                    qcodon += aa2re[aa]
+            qcodon += "$"
+            match = re.search(qcodon, sh_nuc)
+            if match:
+                qcodon = qcodon.replace("^", "").replace("$", "")
+                shift_id_pos += 1
+                return qcodon, shift_id_pos
+        if not match:
+            # failed to find a match (frameshift)
+            return -1, shift_id_pos
+    elif shift_val in (3 * anchor_len - 1, 3 * anchor_len - 2):
+        shift_val = 3 * anchor_len - shift_val
+        # obtain shifted anchor and corresponding nucl
+        # first check if the shifted pos is just at the end of the
+        # previous anchor.
+        for j in range(1, len(sh_anc)):
+            qcodon = "^"
+            for k, aa in enumerate(sh_anc):
+                if k == j - 1:
+                    # will be considered in the next step
+                    pass
+                elif k == j:
+                    qcodon += _merge_aa2re(
+                        sh_anc[j - 1],
+                        sh_anc[j],
+                        shift_val,
+                        aa2re,
+                        shift_id[shift_id_pos].upper(),
+                    )
+                else:
+                    qcodon += aa2re[aa]
+            qcodon += "$"
+            match = re.search(qcodon, sh_nuc)
+            if match:
+                qcodon = qcodon.replace("^", "").replace("$", "")
+                shift_id_pos += 1
+                return qcodon, shift_id_pos
+        if not match:
+            # failed to find a match (frameshift)
+            return -1, shift_id_pos
+
+
+def _merge_aa2re(aa1, aa2, shift_val, aa2re, reid):
+    """Merge two amino acids based on detected frame shift value (PRIVATE)."""
+
+    def get_aa_from_codonre(re_aa):
+        aas = []
+        m = 0
+        for i in re_aa:
+            if i == "[":
+                m = -1
+                aas.append("")
+            elif i == "]":
+                m = 0
+                continue
+            elif m == -1:
+                aas[-1] = aas[-1] + i
+            elif m == 0:
+                aas.append(i)
+        return aas
+
+    scodon = list(map(get_aa_from_codonre, (aa2re[aa1], aa2re[aa2])))
+    if shift_val == 1:
+        intersect = "".join(set(scodon[0][2]) & set(scodon[1][0]))
+        scodonre = "(?P<" + reid + ">"
+        scodonre += (
+            "["
+            + scodon[0][0]
+            + "]"
+            + "["
+            + scodon[0][1]
+            + "]"
+            + "["
+            + intersect
+            + "]"
+            + "["
+            + scodon[1][1]
+            + "]"
+            + "["
+            + scodon[1][2]
+            + "]"
+        )
+    elif shift_val == 2:
+        intersect1 = "".join(set(scodon[0][1]) & set(scodon[1][0]))
+        intersect2 = "".join(set(scodon[0][2]) & set(scodon[1][1]))
+        scodonre = "(?P<" + reid + ">"
+        scodonre += (
+            "["
+            + scodon[0][0]
+            + "]"
+            + "["
+            + intersect1
+            + "]"
+            + "["
+            + intersect2
+            + "]"
+            + "["
+            + scodon[1][2]
+            + "]"
+        )
+    scodonre += ")"
+    return scodonre
+
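+# The merged pattern spans 3*2 - shift_val nucleotides: five character
+# classes for shift_val == 1 and four for shift_val == 2, wrapped in a
+# named group so that _get_codon_rec can later recover the shifted
+# positions from match.groupdict().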
+
+def _get_codon_rec(
+    pro, nucl, span_mode, gap_char, codon_table, complete_protein=False, max_score=10,
+):
+    """Generate codon alignment based on regular re match (PRIVATE).
+
+    span_mode is a tuple returned by _check_corr. The first element
+    is the span of a re search, and the second element is the mode
+    for the match.
+
+    mode
+     - 0: direct match
+     - 1: mismatch (no indels)
+     - 2: frameshift
+
+    """
+    import re
+    from Bio.Seq import Seq
+
+    nucl_seq = nucl.seq.replace(gap_char, "")
+    span = span_mode[0]
+    mode = span_mode[1]
+    aa2re = _get_aa_regex(codon_table)
+    if mode in (0, 1):
+        if len(pro.seq.replace(gap_char, "")) * 3 != (span[1] - span[0]):
+            raise ValueError(
+                f"Protein Record {pro.id} and "
+                f"Nucleotide Record {nucl.id} do not match!"
+            )
+        aa_num = 0
+        codon_seq = CodonSeq()
+        for aa in pro.seq:
+            if aa == "-":
+                codon_seq += "---"
+            elif complete_protein and aa_num == 0:
+                this_codon = nucl_seq[span[0] : span[0] + 3]
+                if not re.search(
+                    _codons2re(codon_table.start_codons), str(this_codon.upper())
+                ):
+                    max_score -= 1
+                    warnings.warn(
+                        f"start codon of {pro.id} ({aa} {aa_num}) does not "
+                        f"correspond to {nucl.id} ({this_codon})",
+                        BiopythonWarning,
+                    )
+                if max_score == 0:
+                    raise RuntimeError(
+                        f"max_score reached for {nucl.id}! Please raise up "
+                        "the tolerance to get an alignment in anyway"
+                    )
+                codon_seq += this_codon
+                aa_num += 1
+            else:
+                this_codon = nucl_seq[span[0] + 3 * aa_num : span[0] + 3 * (aa_num + 1)]
+                if this_codon.upper().translate(table=codon_table) != aa:
+                    max_score -= 1
+                    warnings.warn(
+                        "%s(%s %d) does not correspond to %s(%s)"
+                        % (pro.id, aa, aa_num, nucl.id, this_codon),
+                        BiopythonWarning,
+                    )
+                if max_score == 0:
+                    raise RuntimeError(
+                        f"max_score reached for {nucl.id}! Please raise up "
+                        "the tolerance to get an alignment in anyway"
+                    )
+                codon_seq += this_codon
+                aa_num += 1
+        return SeqRecord(codon_seq, id=nucl.id)
+    elif mode == 2:
+        from collections import deque
+
+        shift_pos = deque([])
+        shift_start = []
+        match = span_mode[2]
+        m_groupdict = list(match.groupdict().keys())
+        # backward frameshift
+        for i in m_groupdict:
+            shift_pos.append(match.span(i))
+            shift_start.append(match.start(i))
+        rf_table = []
+        i = match.start()
+        while True:
+            rf_table.append(i)
+            i += 3
+            if i in shift_start and m_groupdict[shift_start.index(i)].isupper():
+                shift_index = shift_start.index(i)
+                shift_val = 6 - (shift_pos[shift_index][1] - shift_pos[shift_index][0])
+                rf_table.append(i)
+                rf_table.append(i + 3 - shift_val)
+                i = shift_pos[shift_index][1]
+            elif i in shift_start and m_groupdict[shift_start.index(i)].islower():
+                i = shift_pos[shift_start.index(i)][1]
+            if i >= match.end():
+                break
+        codon_seq = CodonSeq()
+        aa_num = 0
+        for aa in pro.seq:
+            if aa == "-":
+                codon_seq += "---"
+            elif complete_protein and aa_num == 0:
+                this_codon = nucl_seq[rf_table[0] : rf_table[0] + 3]
+                if not re.search(
+                    _codons2re(codon_table.start_codons), str(this_codon.upper())
+                ):
+                    max_score -= 1
+                    warnings.warn(
+                        f"start codon of {pro.id}({aa} {aa_num}) does not "
+                        f"correspond to {nucl.id}({this_codon})",
+                        BiopythonWarning,
+                    )
+                codon_seq += this_codon
+                aa_num += 1
+            else:
+                if (
+                    aa_num < len(pro.seq.replace("-", "")) - 1
+                    and rf_table[aa_num + 1] - rf_table[aa_num] - 3 < 0
+                ):
+                    max_score -= 1
+                    start = rf_table[aa_num]
+                    end = start + (3 - shift_val)
+                    ngap = shift_val
+                    this_codon = nucl_seq[start:end] + "-" * ngap
+                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
+                    max_score -= 1
+                    start = rf_table[aa_num - 1] + 3
+                    end = rf_table[aa_num]
+                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
+                    this_codon = (
+                        nucl_seq[start:end]
+                        + "-" * ngap
+                        + nucl_seq[rf_table[aa_num] : rf_table[aa_num] + 3]
+                    )
+                else:
+                    start = rf_table[aa_num]
+                    end = start + 3
+                    this_codon = nucl_seq[start:end]
+                    if this_codon.upper().translate(table=codon_table) != aa:
+                        max_score -= 1
+                        warnings.warn(
+                            f"Codon of {pro.id}({aa} {aa_num}) does not "
+                            f"correspond to {nucl.id}({this_codon})",
+                            BiopythonWarning,
+                        )
+                if max_score == 0:
+                    raise RuntimeError(
+                        f"max_score reached for {nucl.id}! Please raise up "
+                        "the tolerance to get an alignment in anyway"
+                    )
+                codon_seq += this_codon
+                aa_num += 1
+        codon_seq.rf_table = rf_table
+        return SeqRecord(codon_seq, id=nucl.id)
+
+
+def _align_shift_recs(recs):
+    """Build alignment according to the frameshift detected by _check_corr (PRIVATE).
+
+    Argument:
+     - recs - a list of SeqRecords containing a CodonSeq dictated
+       by a rf_table (with frameshift in some of them).
+
+    """
+
+    def find_next_int(k, lst):
+        idx = lst.index(k)
+        p = 0
+        while True:
+            if isinstance(lst[idx + p], int):
+                return lst[idx + p], p
+            p += 1
+
+    full_rf_table_lst = [rec.seq.get_full_rf_table() for rec in recs]
+    rf_num = [0] * len(recs)
+    for k, rec in enumerate(recs):
+        for i in rec.seq.get_full_rf_table():
+            if isinstance(i, int):
+                rf_num[k] += 1
+            # isinstance(i, float) should be True
+            elif rec.seq[int(i) : int(i) + 3] == "---":
+                rf_num[k] += 1
+    if len(set(rf_num)) != 1:
+        raise RuntimeError("Number of alignable codons unequal in given records")
+    i = 0
+    rec_num = len(recs)
+    while True:
+        add_lst = []
+        try:
+            col_rf_lst = [k[i] for k in full_rf_table_lst]
+        except IndexError:
+            # we probably reached the last codon
+            break
+        for j, k in enumerate(col_rf_lst):
+            add_lst.append((j, int(k)))
+            if isinstance(k, float) and recs[j].seq[int(k) : int(k) + 3] != "---":
+                m, p = find_next_int(k, full_rf_table_lst[j])
+                if (m - k) % 3 != 0:
+                    gap_num = 3 - (m - k) % 3
+                else:
+                    gap_num = 0
+                if gap_num != 0:
+                    gaps = "-" * int(gap_num)
+                    seq = CodonSeq(rf_table=recs[j].seq.rf_table)
+                    seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
+                    full_rf_table = full_rf_table_lst[j]
+                    bp = full_rf_table.index(k)
+                    full_rf_table = full_rf_table[:bp] + [
+                        v + int(gap_num) for v in full_rf_table[bp + 1 :]
+                    ]
+                    full_rf_table_lst[j] = full_rf_table
+                    recs[j].seq = seq
+                add_lst.pop()
+                gap_num += m - k
+                i += p - 1
+        if len(add_lst) != rec_num:
+            for j, k in add_lst:
+                seq = CodonSeq(rf_table=recs[j].seq.rf_table)
+                gaps = "-" * int(gap_num)
+                seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
+                full_rf_table = full_rf_table_lst[j]
+                bp = full_rf_table.index(k)
+                inter_rf = []
+                for t in range(0, len(gaps), 3):
+                    inter_rf.append(k + t + 3.0)
+                full_rf_table = (
+                    full_rf_table[:bp]
+                    + inter_rf
+                    + [v + int(gap_num) for v in full_rf_table[bp:]]
+                )
+                full_rf_table_lst[j] = full_rf_table
+                recs[j].seq = seq
+        i += 1
+    return recs
+
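+# A minimal usage sketch for build() (an illustrative example; the file
+# names and formats here are assumptions, not part of this module):
+#
+#     from Bio import AlignIO, SeqIO
+#     from Bio import codonalign
+#
+#     pro_align = AlignIO.read("protein.aln", "clustal")
+#     nucl_seqs = SeqIO.to_dict(SeqIO.parse("nucl.fasta", "fasta"))
+#     codon_aln = codonalign.build(pro_align, nucl_seqs)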
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/codonalign/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/codonalign/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..5dcc070
Binary files /dev/null and b/code/lib/Bio/codonalign/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/codonalign/__pycache__/chisq.cpython-37.pyc b/code/lib/Bio/codonalign/__pycache__/chisq.cpython-37.pyc
new file mode 100644
index 0000000..b4a3a28
Binary files /dev/null and b/code/lib/Bio/codonalign/__pycache__/chisq.cpython-37.pyc differ
diff --git a/code/lib/Bio/codonalign/__pycache__/codonalignment.cpython-37.pyc b/code/lib/Bio/codonalign/__pycache__/codonalignment.cpython-37.pyc
new file mode 100644
index 0000000..d424822
Binary files /dev/null and b/code/lib/Bio/codonalign/__pycache__/codonalignment.cpython-37.pyc differ
diff --git a/code/lib/Bio/codonalign/__pycache__/codonseq.cpython-37.pyc b/code/lib/Bio/codonalign/__pycache__/codonseq.cpython-37.pyc
new file mode 100644
index 0000000..a7f1b5d
Binary files /dev/null and b/code/lib/Bio/codonalign/__pycache__/codonseq.cpython-37.pyc differ
diff --git a/code/lib/Bio/codonalign/chisq.py b/code/lib/Bio/codonalign/chisq.py
new file mode 100644
index 0000000..6b1ff5f
--- /dev/null
+++ b/code/lib/Bio/codonalign/chisq.py
@@ -0,0 +1,148 @@
+"""Python implementation of chisqprob, to avoid SciPy dependency.
+
+Adapted from SciPy: scipy/special/cephes/{chdtr,igam}.
+"""
+
+import math
+
+# Cephes Math Library Release 2.0:  April, 1987
+# Copyright 1985, 1987 by Stephen L. Moshier
+# Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+MACHEP = 0.0000001  # the machine roundoff error / tolerance
+BIG = 4.503599627370496e15
+BIGINV = 2.22044604925031308085e-16
+
+
+def chisqprob(x, df):
+    """Probability value (1-tail) for the Chi^2 probability distribution.
+
+    Unlike the SciPy original, this pure-Python port operates on scalars.
+
+    Parameters
+    ----------
+    x : float > 0
+
+    df : int or float, degrees of freedom, >= 1
+
+    Returns
+    -------
+    chisqprob : float
+        The area from ``x`` to infinity under the Chi^2 probability
+        distribution with degrees of freedom ``df``.
+
+    """
+    if x <= 0:
+        return 1.0
+    if x == 0:
+        return 0.0
+    if df <= 0:
+        raise ValueError("Domain error.")
+    if x < 1.0 or x < df:
+        return 1.0 - _igam(0.5 * df, 0.5 * x)
+    return _igamc(0.5 * df, 0.5 * x)
+
+
+def _igamc(a, x):
+    """Complemented incomplete Gamma integral (PRIVATE).
+
+    Parameters
+    ----------
+    a: float
+    x: float
+
+    Returns
+    -------
+    float
+
+    Notes
+    -----
+    The function is defined by::
+
+        igamc(a,x)   =   1 - igam(a,x)
+
+                                inf.
+                                   -
+                          1       | |  -t  a-1
+                    =   -----     |   e   t   dt.
+                         -      | |
+                        | (a)    -
+                                    x
+
+    In this implementation both arguments must be positive.
+    The integral is evaluated by either a power series or
+    continued fraction expansion, depending on the relative
+    values of a and x.
+
+    """
+    # Compute  x**a * exp(-x) / Gamma(a)
+    ax = math.exp(a * math.log(x) - x - math.lgamma(a))
+
+    # Continued fraction
+    y = 1.0 - a
+    z = x + y + 1.0
+    c = 0.0
+    pkm2 = 1.0
+    qkm2 = x
+    pkm1 = x + 1.0
+    qkm1 = z * x
+    ans = pkm1 / qkm1
+    while True:
+        c += 1.0
+        y += 1.0
+        z += 2.0
+        yc = y * c
+        pk = pkm1 * z - pkm2 * yc
+        qk = qkm1 * z - qkm2 * yc
+        if qk != 0:
+            r = pk / qk
+            t = abs((ans - r) / r)
+            ans = r
+        else:
+            t = 1.0
+        pkm2 = pkm1
+        pkm1 = pk
+        qkm2 = qkm1
+        qkm1 = qk
+        if abs(pk) > BIG:
+            pkm2 *= BIGINV
+            pkm1 *= BIGINV
+            qkm2 *= BIGINV
+            qkm1 *= BIGINV
+        if t <= MACHEP:
+            return ans * ax
+
+
+def _igam(a, x):
+    """Left tail of incomplete Gamma function (PRIVATE).
+
+    Computes this formula::
+
+                 inf.      k
+          a  -x   -       x
+         x  e     >   ----------
+                  -     -
+                k=0   | (a+k+1)
+
+    """
+    # Compute  x**a * exp(-x) / Gamma(a)
+    ax = math.exp(a * math.log(x) - x - math.lgamma(a))
+
+    # Power series
+    r = a
+    c = 1.0
+    ans = 1.0
+
+    while True:
+        r += 1.0
+        c *= x / r
+        ans += c
+        if c / ans <= MACHEP:
+            return ans * ax / a
+
+
+# --- Speed ---
+
+# try:
+#    from scipy.stats import chisqprob
+# except ImportError:
+#    pass
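+
+# A quick sanity check (illustrative): the 5% critical value of the
+# Chi^2 distribution with 1 degree of freedom is about 3.841, so
+# chisqprob(3.841, 1) should be approximately 0.05.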
diff --git a/code/lib/Bio/codonalign/codonalignment.py b/code/lib/Bio/codonalign/codonalignment.py
new file mode 100644
index 0000000..f5570b6
--- /dev/null
+++ b/code/lib/Bio/codonalign/codonalignment.py
@@ -0,0 +1,513 @@
+# Copyright 2013 by Zheng Ruan (zruan1991@gmail.com).
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Code for dealing with Codon Alignment.
+
+The CodonAlignment class inherits from the MultipleSeqAlignment class and is
+the core class for dealing with codon alignments in Biopython.
+"""
+
+import warnings
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.SeqRecord import SeqRecord
+from Bio.Data import CodonTable
+from Bio import BiopythonWarning
+
+
+from Bio.codonalign.codonseq import _get_codon_list, CodonSeq, cal_dn_ds
+from Bio.codonalign.chisq import chisqprob
+
+
+class CodonAlignment(MultipleSeqAlignment):
+    """Codon Alignment class that inherits from MultipleSeqAlignment.
+
+    >>> from Bio.SeqRecord import SeqRecord
+    >>> a = SeqRecord(CodonSeq("AAAACGTCG"), id="Alpha")
+    >>> b = SeqRecord(CodonSeq("AAA---TCG"), id="Beta")
+    >>> c = SeqRecord(CodonSeq("AAAAGGTGG"), id="Gamma")
+    >>> print(CodonAlignment([a, b, c]))
+    CodonAlignment with 3 rows and 9 columns (3 codons)
+    AAAACGTCG Alpha
+    AAA---TCG Beta
+    AAAAGGTGG Gamma
+
+    """
+
+    def __init__(self, records="", name=None):
+        """Initialize the class."""
+        MultipleSeqAlignment.__init__(self, records)
+
+        # check the type of the alignment to be nucleotide
+        for rec in self:
+            if not isinstance(rec.seq, CodonSeq):
+                raise TypeError(
+                    "CodonSeq objects are expected in each SeqRecord in CodonAlignment"
+                )
+
+        if self.get_alignment_length() % 3 != 0:
+            raise ValueError(
+                "Alignment length is not a multiple of "
+                "three (i.e. a whole number of codons)"
+            )
+
+    def __str__(self):
+        """Return a multi-line string summary of the alignment.
+
+        This output is intended to be readable; large alignments are
+        shown truncated. Alignments with more than 60 rows are shown as
+        the first 18 rows, an ellipsis, and the final row; each row is
+        truncated to 60 columns (20 codons), with the record identifiers.
+        This should fit nicely on a single screen.
+
+        """
+        rows = len(self._records)
+        lines = [
+            "CodonAlignment with %i rows and %i columns (%i codons)"
+            % (rows, self.get_alignment_length(), self.get_aln_length(),)
+        ]
+
+        if rows <= 60:
+            lines.extend([self._str_line(rec, length=60) for rec in self._records])
+        else:
+            lines.extend([self._str_line(rec, length=60) for rec in self._records[:18]])
+            lines.append("...")
+            lines.append(self._str_line(self._records[-1], length=60))
+        return "\n".join(lines)
+
+    def __getitem__(self, index):
+        """Return a CodonAlignment object for single indexing."""
+        if isinstance(index, int):
+            return self._records[index]
+        elif isinstance(index, slice):
+            return CodonAlignment(self._records[index])
+        elif len(index) != 2:
+            raise TypeError("Invalid index type.")
+        # Handle double indexing
+        row_index, col_index = index
+        if isinstance(row_index, int):
+            return self._records[row_index][col_index]
+        elif isinstance(col_index, int):
+            return "".join(str(rec[col_index]) for rec in self._records[row_index])
+        else:
+            return MultipleSeqAlignment(
+                rec[col_index] for rec in self._records[row_index]
+            )
+
+    def __add__(self, other):
+        """Combine two codonalignments with the same number of rows by adding them.
+
+        The method also allows combining a CodonAlignment object with a
+        MultipleSeqAlignment object. The following rules apply:
+
+            * CodonAlignment + CodonAlignment -> CodonAlignment
+            * CodonAlignment + MultipleSeqAlignment -> MultipleSeqAlignment
+        """
+        if isinstance(other, CodonAlignment):
+            if len(self) != len(other):
+                raise ValueError(
+                    "When adding two alignments they must have the same length"
+                    " (i.e. same number or rows)"
+                )
+            warnings.warn(
+                "Please make sure the two CodonAlignment objects are sharing the same codon table. This is not checked by Biopython.",
+                BiopythonWarning,
+            )
+            merged = (
+                SeqRecord(seq=CodonSeq(left.seq + right.seq))
+                for left, right in zip(self, other)
+            )
+            return CodonAlignment(merged)
+        elif isinstance(other, MultipleSeqAlignment):
+            if len(self) != len(other):
+                raise ValueError(
+                    "When adding two alignments they must have the same length"
+                    " (i.e. same number or rows)"
+                )
+            return self.toMultipleSeqAlignment() + other
+        else:
+            raise TypeError(
+                "Only CodonAlignment or MultipleSeqAlignment object can be"
+                f" added with a CodonAlignment object. {object(other)} detected."
+            )
+
+    def get_aln_length(self):
+        """Get alignment length."""
+        return self.get_alignment_length() // 3
+
+    def toMultipleSeqAlignment(self):
+        """Convert the CodonAlignment to a MultipleSeqAlignment.
+
+        Return a MultipleSeqAlignment containing all the
+        SeqRecord in the CodonAlignment using Seq to store
+        sequences
+        """
+        alignments = [SeqRecord(rec.seq.toSeq(), id=rec.id) for rec in self._records]
+        return MultipleSeqAlignment(alignments)
+
+    def get_dn_ds_matrix(self, method="NG86", codon_table=None):
+        """Available methods include NG86, LWL85, YN00 and ML.
+
+        Argument:
+         - method       - Available methods include NG86, LWL85, YN00 and ML.
+         - codon_table  - Codon table to use for forward translation.
+
+        """
+        from Bio.Phylo.TreeConstruction import DistanceMatrix as DM
+
+        if codon_table is None:
+            codon_table = CodonTable.generic_by_id[1]
+        names = [i.id for i in self._records]
+        size = len(self._records)
+        dn_matrix = []
+        ds_matrix = []
+        for i in range(size):
+            dn_matrix.append([])
+            ds_matrix.append([])
+            for j in range(i + 1):
+                if i != j:
+                    dn, ds = cal_dn_ds(
+                        self._records[i],
+                        self._records[j],
+                        method=method,
+                        codon_table=codon_table,
+                    )
+                    dn_matrix[i].append(dn)
+                    ds_matrix[i].append(ds)
+                else:
+                    dn_matrix[i].append(0.0)
+                    ds_matrix[i].append(0.0)
+        dn_dm = DM(names, matrix=dn_matrix)
+        ds_dm = DM(names, matrix=ds_matrix)
+        return dn_dm, ds_dm
+
+    def get_dn_ds_tree(
+        self, dn_ds_method="NG86", tree_method="UPGMA", codon_table=None
+    ):
+        """Construct dn tree and ds tree.
+
+        Argument:
+         - dn_ds_method - Available methods include NG86, LWL85, YN00 and ML.
+         - tree_method  - Available methods include UPGMA and NJ.
+
+        """
+        from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
+
+        if codon_table is None:
+            codon_table = CodonTable.generic_by_id[1]
+        dn_dm, ds_dm = self.get_dn_ds_matrix(
+            method=dn_ds_method, codon_table=codon_table
+        )
+        dn_constructor = DistanceTreeConstructor()
+        ds_constructor = DistanceTreeConstructor()
+        if tree_method == "UPGMA":
+            dn_tree = dn_constructor.upgma(dn_dm)
+            ds_tree = ds_constructor.upgma(ds_dm)
+        elif tree_method == "NJ":
+            dn_tree = dn_constructor.nj(dn_dm)
+            ds_tree = ds_constructor.nj(ds_dm)
+        else:
+            raise RuntimeError(
+                f"Unknown tree method ({tree_method})."
+                " Only NJ and UPGMA are accepted."
+            )
+        return dn_tree, ds_tree
+
+    @classmethod
+    def from_msa(cls, align):
+        """Convert a MultipleSeqAlignment to CodonAlignment.
+
+        Function to convert a MultipleSeqAlignment to CodonAlignment.
+        It is the user's responsibility to ensure all the requirement
+        needed by CodonAlignment is met.
+        """
+        rec = [SeqRecord(CodonSeq(str(i.seq)), id=i.id) for i in align._records]
+        return cls(rec)
+
+
+def mktest(codon_alns, codon_table=None, alpha=0.05):
+    """McDonald-Kreitman test for neutrality.
+
+    Implement the McDonald-Kreitman test for neutrality (PMID: 1904993).
+    This method counts changes rather than sites
+    (http://mkt.uab.es/mkt/help_mkt.asp).
+
+    Arguments:
+     - codon_alns  - list of CodonAlignment to compare (each
+       CodonAlignment object corresponds to a gene sampled from a species)
+
+    Return the p-value of test result.
+    """
+    import copy
+
+    if codon_table is None:
+        codon_table = CodonTable.generic_by_id[1]
+    if not all(isinstance(i, CodonAlignment) for i in codon_alns):
+        raise TypeError("mktest accepts CodonAlignment list.")
+    codon_aln_len = [i.get_alignment_length() for i in codon_alns]
+    if len(set(codon_aln_len)) != 1:
+        raise RuntimeError(
+            "CodonAlignment object for mktest should be of equal length."
+        )
+    codon_num = codon_aln_len[0] // 3
+    # prepare codon_dict (taking stop codon as an extra amino acid)
+    codon_dict = copy.deepcopy(codon_table.forward_table)
+    for stop in codon_table.stop_codons:
+        codon_dict[stop] = "stop"
+    # prepare codon_lst
+    codon_lst = []
+    for codon_aln in codon_alns:
+        codon_lst.append([])
+        for i in codon_aln:
+            codon_lst[-1].append(_get_codon_list(i.seq))
+    codon_set = []
+    for i in range(codon_num):
+        uniq_codons = []
+        for j in codon_lst:
+            uniq_codon = {k[i] for k in j}
+            uniq_codons.append(uniq_codon)
+        codon_set.append(uniq_codons)
+    syn_fix, nonsyn_fix, syn_poly, nonsyn_poly = 0, 0, 0, 0
+    G, nonsyn_G = _get_codon2codon_matrix(codon_table=codon_table)
+    for i in codon_set:
+        all_codon = i[0].union(*i[1:])
+        if "-" in all_codon or len(all_codon) == 1:
+            continue
+        fix_or_not = all(len(k) == 1 for k in i)
+        if fix_or_not:
+            # fixed
+            nonsyn_subgraph = _get_subgraph(all_codon, nonsyn_G)
+            subgraph = _get_subgraph(all_codon, G)
+            this_non = _count_replacement(all_codon, nonsyn_subgraph)
+            this_syn = _count_replacement(all_codon, subgraph) - this_non
+            nonsyn_fix += this_non
+            syn_fix += this_syn
+        else:
+            # not fixed
+            nonsyn_subgraph = _get_subgraph(all_codon, nonsyn_G)
+            subgraph = _get_subgraph(all_codon, G)
+            this_non = _count_replacement(all_codon, nonsyn_subgraph)
+            this_syn = _count_replacement(all_codon, subgraph) - this_non
+            nonsyn_poly += this_non
+            syn_poly += this_syn
+    return _G_test([syn_fix, nonsyn_fix, syn_poly, nonsyn_poly])
+
+
+def _get_codon2codon_matrix(codon_table):
+    """Get codon codon substitution matrix (PRIVATE).
+
+    Elements in the matrix are number of synonymous and nonsynonymous
+    substitutions required for the substitution.
+    """
+    base_tuple = ("A", "T", "C", "G")
+    codons = [
+        i
+        for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons
+        if "U" not in i
+    ]
+    # set up codon_dict considering stop codons; copy the forward table
+    # so the shared CodonTable object is not mutated
+    codon_dict = dict(codon_table.forward_table)
+    for stop in codon_table.stop_codons:
+        codon_dict[stop] = "stop"
+    G = {}  # graph for substitution
+    nonsyn_G = {}  # graph for nonsynonymous substitution
+    graph = {}
+    graph_nonsyn = {}
+    for i, codon in enumerate(codons):
+        graph[codon] = {}
+        graph_nonsyn[codon] = {}
+        for p, b in enumerate(codon):
+            for j in base_tuple:
+                tmp_codon = codon[0:p] + j + codon[p + 1 :]
+                if codon_dict[codon] != codon_dict[tmp_codon]:
+                    graph_nonsyn[codon][tmp_codon] = 1
+                    graph[codon][tmp_codon] = 1
+                else:
+                    if codon != tmp_codon:
+                        graph_nonsyn[codon][tmp_codon] = 0.1
+                        graph[codon][tmp_codon] = 1
+    for codon1 in codons:
+        nonsyn_G[codon1] = {}
+        G[codon1] = {}
+        for codon2 in codons:
+            if codon1 == codon2:
+                nonsyn_G[codon1][codon2] = 0
+                G[codon1][codon2] = 0
+            else:
+                nonsyn_G[codon1][codon2] = _dijkstra(graph_nonsyn, codon1, codon2)
+                G[codon1][codon2] = _dijkstra(graph, codon1, codon2)
+    return G, nonsyn_G
+
+
+def _dijkstra(graph, start, end):
+    """Dijkstra's algorithm Python implementation (PRIVATE).
+
+    Algorithm adapted from
+    http://thomas.pelletier.im/2010/02/dijkstras-algorithm-python-implementation/.
+    However, an obvious bug in::
+
+        if D[child_node] >(<) D[node] + child_value:
+
+    is fixed.
+    This function will return the distance between start and end.
+
+    Arguments:
+     - graph: Dictionary of dictionary (keys are vertices).
+     - start: Start vertex.
+     - end: End vertex.
+
+    Output:
+       The distance between the start and end vertices.
+
+    """
+    D = {}  # Final distances dict
+    P = {}  # Predecessor dict
+    # Fill the dicts with default values
+    for node in graph.keys():
+        D[node] = 100  # Vertices are unreachable
+        P[node] = ""  # Vertices have no predecessors
+    D[start] = 0  # The start vertex needs no move
+    unseen_nodes = list(graph.keys())  # All nodes are unseen
+    while len(unseen_nodes) > 0:
+        # Select the node with the lowest value in D (final distance)
+        shortest = None
+        node = ""
+        for temp_node in unseen_nodes:
+            if shortest is None:
+                shortest = D[temp_node]
+                node = temp_node
+            elif D[temp_node] < shortest:
+                shortest = D[temp_node]
+                node = temp_node
+        # Remove the selected node from unseen_nodes
+        unseen_nodes.remove(node)
+        # For each child (ie: connected vertex) of the current node
+        for child_node, child_value in graph[node].items():
+            if D[child_node] > D[node] + child_value:
+                D[child_node] = D[node] + child_value
+                # To go to child_node, you have to go through node
+                P[child_node] = node
+        if node == end:
+            break
+    # Set a clean path
+    path = []
+    # We begin from the end
+    node = end
+    distance = 0
+    # While we are not arrived at the beginning
+    while not (node == start):
+        if path.count(node) == 0:
+            path.insert(0, node)  # Insert the predecessor of the current node
+            node = P[node]  # The current node becomes its predecessor
+        else:
+            break
+    path.insert(0, start)  # Finally, insert the start vertex
+    for i in range(len(path) - 1):
+        distance += graph[path[i]][path[i + 1]]
+    return distance
+
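+# Toy example of the expected graph shape (illustrative values):
+#
+#     g = {"TTT": {"TTC": 1}, "TTC": {"TTT": 1, "TTA": 1}, "TTA": {"TTC": 1}}
+#     _dijkstra(g, "TTT", "TTA")  # -> 2
+#
+# In this module the graphs come from _get_codon2codon_matrix, with edge
+# weight 1 per single-base change (0.1 for synonymous edges in the
+# nonsynonymous graph).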
+
+def _count_replacement(codon_set, G):
+    """Count replacement needed for a given codon_set (PRIVATE)."""
+    from math import floor
+
+    if len(codon_set) == 1:
+        return 0
+    elif len(codon_set) == 2:
+        codons = list(codon_set)
+        return floor(G[codons[0]][codons[1]])
+    else:
+        return _prim(G)
+
+
+def _prim(G):
+    """Prim's algorithm to find minimum spanning tree (PRIVATE).
+
+    Code is adapted from
+    http://programmingpraxis.com/2010/04/09/minimum-spanning-tree-prims-algorithm/
+    """
+    from math import floor
+    from collections import defaultdict
+    from heapq import heapify, heappop, heappush
+
+    nodes = []
+    edges = []
+    for i in G.keys():
+        nodes.append(i)
+        for j in G[i]:
+            if (i, j, G[i][j]) not in edges and (j, i, G[i][j]) not in edges:
+                edges.append((i, j, G[i][j]))
+    conn = defaultdict(list)
+    for n1, n2, c in edges:
+        conn[n1].append((c, n1, n2))
+        conn[n2].append((c, n2, n1))
+    mst = []  # minimum spanning tree
+    used = {nodes[0]}  # a single-element set; set(nodes[0]) would split the codon string
+    usable_edges = conn[nodes[0]][:]
+    heapify(usable_edges)
+    while usable_edges:
+        cost, n1, n2 = heappop(usable_edges)
+        if n2 not in used:
+            used.add(n2)
+            mst.append((n1, n2, cost))
+            for e in conn[n2]:
+                if e[2] not in used:
+                    heappush(usable_edges, e)
+    length = 0
+    for p in mst:
+        length += floor(p[2])
+    return length
+
+
+def _get_subgraph(codons, G):
+    """Get the subgraph that contains all codons in list (PRIVATE)."""
+    subgraph = {}
+    for i in codons:
+        subgraph[i] = {}
+        for j in codons:
+            if i != j:
+                subgraph[i][j] = G[i][j]
+    return subgraph
+
+
+def _G_test(site_counts):
+    """G test for 2x2 contingency table (PRIVATE).
+
+    Arguments:
+     - site_counts - [syn_fix, nonsyn_fix, syn_poly, nonsyn_poly]
+
+    >>> print("%0.6f" % _G_test([17, 7, 42, 2]))
+    0.004924
+    """
+    # TODO:
+    #   Apply continuity correction for Chi-square test.
+    from math import log
+
+    # from scipy.stats import chi2
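+    # G = 2 * sum(obs * ln(obs / exp)) over the 2x2 fixed/polymorphic x
+    # syn/nonsyn table, compared against a Chi^2 with 1 degree of freedom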
+    G = 0
+    tot = sum(site_counts)
+    tot_syn = site_counts[0] + site_counts[2]
+    tot_non = site_counts[1] + site_counts[3]
+    tot_fix = sum(site_counts[:2])
+    tot_poly = sum(site_counts[2:])
+    exp = [
+        tot_fix * tot_syn / tot,
+        tot_fix * tot_non / tot,
+        tot_poly * tot_syn / tot,
+        tot_poly * tot_non / tot,
+    ]
+    for obs, ex in zip(site_counts, exp):
+        G += obs * log(obs / ex)
+    G *= 2
+    # return 1-chi2.cdf(G, 1) # only 1 dof for 2x2 table
+    return chisqprob(G, 1)
+
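+# A minimal end-to-end sketch (illustrative; `aln` is assumed to be an
+# in-frame nucleotide MultipleSeqAlignment, and the per-species codon
+# alignments passed to mktest are placeholders):
+#
+#     codon_aln = CodonAlignment.from_msa(aln)
+#     dn_tree, ds_tree = codon_aln.get_dn_ds_tree(dn_ds_method="NG86",
+#                                                 tree_method="UPGMA")
+#     p_value = mktest([codon_aln_sp1, codon_aln_sp2])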
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/codonalign/codonseq.py b/code/lib/Bio/codonalign/codonseq.py
new file mode 100644
index 0000000..725355a
--- /dev/null
+++ b/code/lib/Bio/codonalign/codonseq.py
@@ -0,0 +1,1319 @@
+# Copyright 2013 by Zheng Ruan (zruan1991@gmail.com).
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Code for dealing with coding sequence.
+
+The CodonSeq class inherits from the Seq class and is the core class for
+dealing with sequences in a CodonAlignment in Biopython.
+
+"""
+from itertools import permutations
+from math import log
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Data import CodonTable
+
+
+class CodonSeq(Seq):
+    """CodonSeq is designed to be within the SeqRecords of a CodonAlignment class.
+
+    CodonSeq is useful as it allows the user to specify a
+    reading frame when translating a CodonSeq.
+
+    CodonSeq also accepts codon-style slicing by calling the
+    get_codon() method.
+
+    **Important:** Ungapped CodonSeq can be any length if you
+    specify the rf_table. A gapped CodonSeq should have a length
+    that is a multiple of three.
+
+    >>> codonseq = CodonSeq("AAATTTGGGCCAAATTT", rf_table=(0,3,6,8,11,14))
+    >>> print(codonseq.translate())
+    KFGAKF
+
+    test get_full_rf_table method
+
+    >>> p = CodonSeq('AAATTTCCCGG-TGGGTTTAA', rf_table=(0, 3, 6, 9, 11, 14, 17))
+    >>> full_rf_table = p.get_full_rf_table()
+    >>> print(full_rf_table)
+    [0, 3, 6, 9, 12, 15, 18]
+    >>> print(p.translate(rf_table=full_rf_table, ungap_seq=False))
+    KFPPWV*
+    >>> p = CodonSeq('AAATTTCCCGGGAA-TTTTAA', rf_table=(0, 3, 6, 9, 14, 17))
+    >>> print(p.get_full_rf_table())
+    [0, 3, 6, 9, 12.0, 15, 18]
+    >>> p = CodonSeq('AAA------------TAA', rf_table=(0, 3))
+    >>> print(p.get_full_rf_table())
+    [0, 3.0, 6.0, 9.0, 12.0, 15]
+
+    """
+
+    def __init__(self, data="", gap_char="-", rf_table=None):
+        """Initialize the class."""
+        # rf_table should be a tuple or list indicating every codon
+        # start position along the ungapped sequence. For example:
+        # sequence = 'AAATTTGGGCCAAATTT'
+        # rf_table = (0, 3, 6, 8, 11, 14)
+        # the translated protein sequences will be
+        # AAA TTT GGG GCC AAA TTT
+        #  K   F   G   A   K   F
+        # Notice: rf_table applies to ungapped sequence. If there
+        #   are gaps in the sequence, they will be discarded. This
+        #   feature ensures the rf_table is independent of where the
+        #   codon sequence appears in the alignment
+
+        Seq.__init__(self, data.upper())
+        self.gap_char = gap_char
+
+        # check the length of the alignment to be a triple
+        if rf_table is None:
+            length = len(self)
+            if length % 3 != 0:
+                raise ValueError(
+                    "Sequence length is not a multiple of "
+                    "three (i.e. a whole number of codons)"
+                )
+            self.rf_table = list(range(0, length - self.count(gap_char), 3))
+        else:
+            # if gap_char in self:
+            #    assert  len(self) % 3 == 0, \
+            #            "Gapped sequence length is not a triple number"
+            if not isinstance(rf_table, (tuple, list)):
+                raise TypeError("rf_table should be a tuple or list object")
+            if not all(isinstance(i, int) for i in rf_table):
+                raise TypeError(
+                    "Elements in rf_table should be int "
+                    "that specify the codon positions of "
+                    "the sequence"
+                )
+            self.rf_table = rf_table
+
+    def get_codon(self, index):
+        """Get the index codon from the sequence."""
+        if len({i % 3 for i in self.rf_table}) != 1:
+            raise RuntimeError(
+                "frameshift detected. CodonSeq object is not able to deal with "
+                "codon sequence with frameshift. Please use normal slice option."
+            )
+        if isinstance(index, int):
+            if index != -1:
+                return str(self[index * 3 : (index + 1) * 3])
+            else:
+                return str(self[index * 3 :])
+        else:
+            # This slice ensures that codon will always be the unit
+            # in slicing (it won't change to other codon if you are
+            # using reverse slicing such as [::-1]).
+            # The idea of the code below is to first map the slice
+            # to amino acid sequence and then transform it into
+            # codon sequence.
+            aa_index = range(len(self) // 3)
+
+            def cslice(p):
+                aa_slice = aa_index[p]
+                codon_slice = ""
+                for i in aa_slice:
+                    codon_slice += self[i * 3 : i * 3 + 3]
+                return str(codon_slice)
+
+            codon_slice = cslice(index)
+            return CodonSeq(codon_slice)
+
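+    # Example (illustrative): CodonSeq("AAATTTGGG").get_codon(0) == "AAA"
+    # and CodonSeq("AAATTTGGG").get_codon(-1) == "GGG".
+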
+    def get_codon_num(self):
+        """Return the number of codons in the CodonSeq."""
+        return len(self.rf_table)
+
+    def translate(
+        self, codon_table=None, stop_symbol="*", rf_table=None, ungap_seq=True
+    ):
+        """Translate the CodonSeq based on the reading frame in rf_table.
+
+        It is possible for the user to specify a rf_table at this
+        point. If you want gaps to be taken into account in the
+        translation (e.g. with a full rf_table), this is the only
+        way; ungap_seq should be set to False for this purpose.
+        """
+        if codon_table is None:
+            codon_table = CodonTable.generic_by_id[1]
+        amino_acids = []
+        if ungap_seq:
+            tr_seq = str(self).replace(self.gap_char, "")
+        else:
+            tr_seq = str(self)
+        if rf_table is None:
+            rf_table = self.rf_table
+        p = -1  # initiation
+        for i in rf_table:
+            if isinstance(i, float):
+                amino_acids.append("-")
+                continue
+            # elif '---' == tr_seq[i:i+3]:
+            #    amino_acids.append('-')
+            #    continue
+            elif "-" in tr_seq[i : i + 3]:
+                # considering two types of frameshift
+                if p == -1 or p - i == 3:
+                    p = i
+                    codon = tr_seq[i : i + 6].replace("-", "")[:3]
+                elif p - i > 3:
+                    codon = tr_seq[i : i + 3]
+                    p = i
+            else:
+                # normal condition without gaps
+                codon = tr_seq[i : i + 3]
+                p = i
+            if codon in codon_table.stop_codons:
+                amino_acids.append(stop_symbol)
+                continue
+            try:
+                amino_acids.append(codon_table.forward_table[codon])
+            except KeyError:
+                raise RuntimeError(
+                    f"Unknown codon detected ({codon}). Did you"
+                    " forget to specify the ungap_seq argument?"
+                )
+        return "".join(amino_acids)
+
+    def toSeq(self):
+        """Convert DNA to seq object."""
+        return Seq(str(self))
+
+    def get_full_rf_table(self):
+        """Return full rf_table of the CodonSeq records.
+
+        A full rf_table is different from a normal rf_table in that
+        it accounts for gaps in the CodonSeq. It is helpful for
+        constructing alignments that contain frameshifts.
+        """
+        relative_pos = [self.rf_table[0]]
+        for i in range(1, len(self.rf_table)):
+            relative_pos.append(self.rf_table[i] - self.rf_table[i - 1])
+        full_rf_table = []
+        codon_num = 0
+        for i in range(0, len(self), 3):
+            if self[i : i + 3] == self.gap_char * 3:
+                full_rf_table.append(i + 0.0)
+            elif relative_pos[codon_num] == 0:
+                full_rf_table.append(i)
+                codon_num += 1
+            elif relative_pos[codon_num] in (-1, -2):
+                # check the gap status of previous codon
+                gap_stat = 3 - self.count("-", i - 3, i)
+                if gap_stat == 3:
+                    full_rf_table.append(i + relative_pos[codon_num])
+                elif gap_stat == 2:
+                    full_rf_table.append(i + 1 + relative_pos[codon_num])
+                elif gap_stat == 1:
+                    full_rf_table.append(i + 2 + relative_pos[codon_num])
+                codon_num += 1
+            elif relative_pos[codon_num] > 0:
+                full_rf_table.append(i + 0.0)
+            try:
+                this_len = 3 - self.count("-", i, i + 3)
+                relative_pos[codon_num] -= this_len
+            except IndexError:
+                # we probably reached the last codon
+                pass
+        return full_rf_table
+
+    def full_translate(self, codon_table=None, stop_symbol="*"):
+        """Apply full translation with gaps considered."""
+        if codon_table is None:
+            codon_table = CodonTable.generic_by_id[1]
+        full_rf_table = self.get_full_rf_table()
+        return self.translate(
+            codon_table=codon_table,
+            stop_symbol=stop_symbol,
+            rf_table=full_rf_table,
+            ungap_seq=False,
+        )
+
+    def ungap(self, gap="-"):
+        """Return a copy of the sequence without the gap character(s)."""
+        if len(gap) != 1 or not isinstance(gap, str):
+            raise ValueError("Unexpected gap character, %s" % repr(gap))
+        return CodonSeq(str(self).replace(gap, ""), rf_table=self.rf_table)
+
+    @classmethod
+    def from_seq(cls, seq, rf_table=None):
+        """Get codon sequence from sequence data."""
+        if rf_table is None:
+            return cls(str(seq))
+        else:
+            return cls(str(seq), rf_table=rf_table)
+
+
+def _get_codon_list(codonseq):
+    """List of codons according to full_rf_table for counting (PRIVATE)."""
+    # if not isinstance(codonseq, CodonSeq):
+    #    raise TypeError("_get_codon_list accept a CodonSeq object "
+    #                    "({0} detected)".format(type(codonseq)))
+    full_rf_table = codonseq.get_full_rf_table()
+    codon_lst = []
+    for i, k in enumerate(full_rf_table):
+        if isinstance(k, int):
+            start = k
+            try:
+                end = int(full_rf_table[i + 1])
+            except IndexError:
+                end = start + 3
+            this_codon = str(codonseq[start:end])
+            if len(this_codon) == 3:
+                codon_lst.append(this_codon)
+            else:
+                # this_codon is a plain str here, so strip gap characters directly
+                codon_lst.append(this_codon.replace("-", ""))
+        elif str(codonseq[int(k) : int(k) + 3]) == "---":
+            codon_lst.append("---")
+        else:
+            # this may be problematic, as normally no codon should
+            # fall into this condition
+            codon_lst.append(codonseq[int(k) : int(k) + 3])
+    return codon_lst
+
+
+def cal_dn_ds(
+    codon_seq1, codon_seq2, method="NG86", codon_table=None, k=1, cfreq=None,
+):
+    """Calculate dN and dS of the given two sequences.
+
+    Available methods:
+        - NG86  - `Nei and Gojobori (1986)`_ (PMID 3444411).
+        - LWL85 - `Li et al. (1985)`_ (PMID 3916709).
+        - ML    - `Goldman and Yang (1994)`_ (PMID 7968486).
+        - YN00  - `Yang and Nielsen (2000)`_ (PMID 10666704).
+
+    .. _`Nei and Gojobori (1986)`: http://www.ncbi.nlm.nih.gov/pubmed/3444411
+    .. _`Li et al. (1985)`: http://www.ncbi.nlm.nih.gov/pubmed/3916709
+    .. _`Goldman and Yang (1994)`: http://mbe.oxfordjournals.org/content/11/5/725
+    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236
+
+    Arguments:
+     - codon_seq1 - CodonSeq or SeqRecord that contains a CodonSeq
+     - codon_seq2 - CodonSeq or SeqRecord that contains a CodonSeq
+     - k  - transition/transversion rate ratio
+     - cfreq - codon frequency vector; may only be specified when
+       using the ML method. Possible settings are F1x4, F3x4 and F61.
+
+    """
+    if isinstance(codon_seq1, CodonSeq) and isinstance(codon_seq2, CodonSeq):
+        pass
+    elif isinstance(codon_seq1, SeqRecord) and isinstance(codon_seq2, SeqRecord):
+        codon_seq1 = codon_seq1.seq
+        codon_seq2 = codon_seq2.seq
+    else:
+        raise TypeError(
+            "cal_dn_ds accepts two CodonSeq objects or SeqRecord "
+            "that contains CodonSeq as its seq!"
+        )
+    if len(codon_seq1.get_full_rf_table()) != len(codon_seq2.get_full_rf_table()):
+        raise RuntimeError(
+            f"full_rf_table length of seq1 ({len(codon_seq1.get_full_rf_table())})"
+            f" and seq2 ({len(codon_seq2.get_full_rf_table())}) are not the same"
+        )
+    if cfreq is None:
+        cfreq = "F3x4"
+    elif cfreq is not None and method != "ML":
+        raise RuntimeError("cfreq can only be specified when you are using ML method")
+    if cfreq not in ("F1x4", "F3x4", "F61"):
+        import warnings
+
+        warnings.warn(
+            f"Unknown cfreq ({cfreq}). "
+            "Only F1x4, F3x4 and F61 are acceptable. Used F3x4 in the following."
+        )
+        cfreq = "F3x4"
+    if codon_table is None:
+        codon_table = CodonTable.generic_by_id[1]
+    seq1_codon_lst = _get_codon_list(codon_seq1)
+    seq2_codon_lst = _get_codon_list(codon_seq2)
+    # remove gaps in seq_codon_lst
+    seq1 = []
+    seq2 = []
+    for i, j in zip(seq1_codon_lst, seq2_codon_lst):
+        if ("-" not in i) and ("-" not in j):
+            seq1.append(i)
+            seq2.append(j)
+    dnds_func = {"ML": _ml, "NG86": _ng86, "LWL85": _lwl85, "YN00": _yn00}
+    if method == "ML":
+        return dnds_func[method](seq1, seq2, cfreq, codon_table)
+    else:
+        return dnds_func[method](seq1, seq2, k, codon_table)
+
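+# A minimal usage sketch (not a doctest; the toy sequences are illustrative,
+# assuming the CodonSeq class defined above):
+#
+#     s1 = CodonSeq("ATGCTGTTAGGG")
+#     s2 = CodonSeq("ATGCTGTTGGGG")
+#     dN, dS = cal_dn_ds(s1, s2, method="NG86")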
+
+#################################################################
+#              private functions for NG86 method
+#################################################################
+
+
+def _ng86(seq1, seq2, k, codon_table):
+    """NG86 method main function (PRIVATE)."""
+    S_sites1, N_sites1 = _count_site_NG86(seq1, codon_table=codon_table, k=k)
+    S_sites2, N_sites2 = _count_site_NG86(seq2, codon_table=codon_table, k=k)
+    S_sites = (S_sites1 + S_sites2) / 2.0
+    N_sites = (N_sites1 + N_sites2) / 2.0
+    SN = [0, 0]
+    for i, j in zip(seq1, seq2):
+        SN = [
+            m + n for m, n in zip(SN, _count_diff_NG86(i, j, codon_table=codon_table))
+        ]
+
+    ps = SN[0] / S_sites
+    pn = SN[1] / N_sites
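+    # Jukes-Cantor-type correction of the raw proportions; saturated
+    # proportions (>= 3/4) are flagged as -1 below.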
+    if ps < 3 / 4:
+        dS = abs(-3.0 / 4 * log(1 - 4.0 / 3 * ps))
+    else:
+        dS = -1
+    if pn < 3 / 4:
+        dN = abs(-3.0 / 4 * log(1 - 4.0 / 3 * pn))
+    else:
+        dN = -1
+    return dN, dS
+
+
+def _count_site_NG86(codon_lst, codon_table, k=1):
+    """Count synonymous and non-synonymous sites of a list of codons (PRIVATE).
+
+    Arguments:
+     - codon_lst - A three letter codon list from a CodonSeq object.
+       This can be returned from _get_codon_list method.
+     - k - transition/transversion rate ratio.
+
+    """
+    S_site = 0  # synonymous sites
+    N_site = 0  # non-synonymous sites
+    purine = ("A", "G")
+    pyrimidine = ("T", "C")
+    base_tuple = ("A", "T", "C", "G")
+    for codon in codon_lst:
+        neighbor_codon = {"transition": [], "transversion": []}
+        # classify neighbor codons
+        codon = codon.replace("U", "T")
+        if codon == "---":
+            continue
+        for n, i in enumerate(codon):
+            for j in base_tuple:
+                if i == j:
+                    pass
+                elif i in purine and j in purine:
+                    codon_chars = list(codon)
+                    codon_chars[n] = j
+                    this_codon = "".join(codon_chars)
+                    neighbor_codon["transition"].append(this_codon)
+                elif i in pyrimidine and j in pyrimidine:
+                    codon_chars = list(codon)
+                    codon_chars[n] = j
+                    this_codon = "".join(codon_chars)
+                    neighbor_codon["transition"].append(this_codon)
+                else:
+                    codon_chars = list(codon)
+                    codon_chars[n] = j
+                    this_codon = "".join(codon_chars)
+                    neighbor_codon["transversion"].append(this_codon)
+        # count synonymous and non-synonymous sites
+        aa = codon_table.forward_table[codon]
+        this_codon_N_site = this_codon_S_site = 0
+        for neighbor in neighbor_codon["transition"]:
+            if neighbor in codon_table.stop_codons:
+                this_codon_N_site += 1
+            elif codon_table.forward_table[neighbor] == aa:
+                this_codon_S_site += 1
+            else:
+                this_codon_N_site += 1
+        for neighbor in neighbor_codon["transversion"]:
+            if neighbor in codon_table.stop_codons:
+                this_codon_N_site += k
+            elif codon_table.forward_table[neighbor] == aa:
+                this_codon_S_site += k
+            else:
+                this_codon_N_site += k
+        norm_const = (this_codon_N_site + this_codon_S_site) / 3
+        S_site += this_codon_S_site / norm_const
+        N_site += this_codon_N_site / norm_const
+    return (S_site, N_site)
+
+
+def _count_diff_NG86(codon1, codon2, codon_table):
+    """Count differences between two codons, three-letter string (PRIVATE).
+
+    The function will take multiple pathways from codon1 to codon2
+    into account.
+    """
+    if not isinstance(codon1, str) or not isinstance(codon2, str):
+        raise TypeError(
+            "_count_diff_NG86 accepts string object to represent codon"
+            f" ({type(codon1)}, {type(codon2)} detected)"
+        )
+    if len(codon1) != 3 or len(codon2) != 3:
+        raise RuntimeError(
+            "codon should be three letter string"
+            f" ({len(codon1)}, {len(codon2)} detected)"
+        )
+    SN = [0, 0]  # synonymous and nonsynonymous counts
+    if codon1 == "---" or codon2 == "---":
+        return SN
+    base_tuple = ("A", "C", "G", "T")
+    if not all(i in base_tuple for i in codon1):
+        raise RuntimeError(
+            f"Unrecognized character detected in codon1 {codon1}"
+            " (Codons consist of A, T, C or G)"
+        )
+    if not all(i in base_tuple for i in codon2):
+        raise RuntimeError(
+            f"Unrecognized character detected in codon2 {codon2}"
+            " (Codons consist of A, T, C or G)"
+        )
+    if codon1 == codon2:
+        return SN
+    else:
+        diff_pos = []
+        for i, k in enumerate(zip(codon1, codon2)):
+            if k[0] != k[1]:
+                diff_pos.append(i)
+
+        def compare_codon(codon1, codon2, codon_table, weight=1):
+            """Compare two codon accounting for different pathways."""
+            sd = nd = 0
+            if len(set(map(codon_table.forward_table.get, [codon1, codon2]))) == 1:
+                sd += weight
+            else:
+                nd += weight
+            return (sd, nd)
+
+        if len(diff_pos) == 1:
+            SN = [
+                i + j
+                for i, j in zip(
+                    SN, compare_codon(codon1, codon2, codon_table=codon_table)
+                )
+            ]
+        elif len(diff_pos) == 2:
+            for i in diff_pos:
+                temp_codon = codon1[:i] + codon2[i] + codon1[i + 1 :]
+                SN = [
+                    i + j
+                    for i, j in zip(
+                        SN,
+                        compare_codon(
+                            codon1, temp_codon, codon_table=codon_table, weight=0.5
+                        ),
+                    )
+                ]
+                SN = [
+                    i + j
+                    for i, j in zip(
+                        SN,
+                        compare_codon(
+                            temp_codon, codon2, codon_table=codon_table, weight=0.5
+                        ),
+                    )
+                ]
+        elif len(diff_pos) == 3:
+            paths = list(permutations([0, 1, 2], 3))
+            tmp_codon = []
+            for p in paths:
+                tmp1 = codon1[: p[0]] + codon2[p[0]] + codon1[p[0] + 1 :]
+                tmp2 = tmp1[: p[1]] + codon2[p[1]] + tmp1[p[1] + 1 :]
+                tmp_codon.append((tmp1, tmp2))
+                SN = [
+                    i + j
+                    for i, j in zip(
+                        SN, compare_codon(codon1, tmp1, codon_table, weight=0.5 / 3)
+                    )
+                ]
+                SN = [
+                    i + j
+                    for i, j in zip(
+                        SN, compare_codon(tmp1, tmp2, codon_table, weight=0.5 / 3)
+                    )
+                ]
+                SN = [
+                    i + j
+                    for i, j in zip(
+                        SN, compare_codon(tmp2, codon2, codon_table, weight=0.5 / 3)
+                    )
+                ]
+    return SN
+
+
+#################################################################
+#               private functions for LWL85 method
+#################################################################
+
+
+def _lwl85(seq1, seq2, k, codon_table):
+    """LWL85 method main function (PRIVATE).
+
+    Nomenclature is according to Li et al. (1985), PMID 3916709.
+    """
+    codon_fold_dict = _get_codon_fold(codon_table)
+    # count number of sites in different degenerate classes
+    fold0 = [0, 0]
+    fold2 = [0, 0]
+    fold4 = [0, 0]
+    for codon in seq1 + seq2:
+        fold_num = codon_fold_dict[codon]
+        for f in fold_num:
+            if f == "0":
+                fold0[0] += 1
+            elif f == "2":
+                fold2[0] += 1
+            elif f == "4":
+                fold4[0] += 1
+    L = [sum(fold0) / 2.0, sum(fold2) / 2.0, sum(fold4) / 2.0]
+    # count number of differences in different degenerate classes
+    PQ = [0] * 6  # with P0, P2, P4, Q0, Q2, Q4 in each position
+    for codon1, codon2 in zip(seq1, seq2):
+        if (codon1 == "---" or codon2 == "---") or codon1 == codon2:
+            continue
+        else:
+            PQ = [
+                i + j
+                for i, j in zip(
+                    PQ, _diff_codon(codon1, codon2, fold_dict=codon_fold_dict)
+                )
+            ]
+    PQ = [i / j for i, j in zip(PQ, L * 2)]
+    P = PQ[:3]
+    Q = PQ[3:]
+    A = [
+        (1.0 / 2) * log(1.0 / (1 - 2 * i - j)) - (1.0 / 4) * log(1.0 / (1 - 2 * j))
+        for i, j in zip(P, Q)
+    ]
+    B = [(1.0 / 2) * log(1.0 / (1 - 2 * i)) for i in Q]
+    # Li et al. (1985), with L = [L0, L2, L4], A = [A0, A2, A4], B = [B0, B2, B4]:
+    #   dS = 3*(L2*A2 + L4*(A4 + B4)) / (L2 + 3*L4)
+    #   dN = 3*(L2*B2 + L0*(A0 + B0)) / (2*L2 + 3*L0)
+    dS = 3 * (L[1] * A[1] + L[2] * (A[2] + B[2])) / (L[1] + 3 * L[2])
+    dN = 3 * (L[1] * B[1] + L[0] * (A[0] + B[0])) / (2 * L[1] + 3 * L[0])
+    return dN, dS
+
+
+def _get_codon_fold(codon_table):
+    """Classify different position in a codon into different folds (PRIVATE)."""
+
+    def find_fold_class(codon, forward_table):
+        base = {"A", "T", "C", "G"}
+        fold = ""
+        codon_base_lst = list(codon)
+        for i, b in enumerate(codon_base_lst):
+            other_base = base - set(b)
+            aa = []
+            for j in other_base:
+                codon_base_lst[i] = j
+                try:
+                    aa.append(forward_table["".join(codon_base_lst)])
+                except KeyError:
+                    aa.append("stop")
+            if aa.count(forward_table[codon]) == 0:
+                fold += "0"
+            elif aa.count(forward_table[codon]) in (1, 2):
+                fold += "2"
+            elif aa.count(forward_table[codon]) == 3:
+                fold += "4"
+            else:
+                raise RuntimeError(
+                    "Unknown Error, cannot assign the position to a fold"
+                )
+            codon_base_lst[i] = b
+        return fold
+
+    fold_table = {}
+    for codon in codon_table.forward_table:
+        if "U" not in codon:
+            fold_table[codon] = find_fold_class(codon, codon_table.forward_table)
+    fold_table["---"] = "---"
+    return fold_table
+
+
+def _diff_codon(codon1, codon2, fold_dict):
+    """Count number of different substitution types between two codons (PRIVATE).
+
+    returns tuple (P0, P2, P4, Q0, Q2, Q4)
+
+    Nomenclature is according to Li et al. (1985), PMID 3916709.
+    """
+    P0 = P2 = P4 = Q0 = Q2 = Q4 = 0
+    fold_num = fold_dict[codon1]
+    purine = ("A", "G")
+    pyrimidine = ("T", "C")
+    for n, (i, j) in enumerate(zip(codon1, codon2)):
+        if i != j and (i in purine and j in purine):
+            if fold_num[n] == "0":
+                P0 += 1
+            elif fold_num[n] == "2":
+                P2 += 1
+            elif fold_num[n] == "4":
+                P4 += 1
+            else:
+                raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
+        if i != j and (i in pyrimidine and j in pyrimidine):
+            if fold_num[n] == "0":
+                P0 += 1
+            elif fold_num[n] == "2":
+                P2 += 1
+            elif fold_num[n] == "4":
+                P4 += 1
+            else:
+                raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
+        if i != j and (
+            (i in purine and j in pyrimidine) or (i in pyrimidine and j in purine)
+        ):
+            if fold_num[n] == "0":
+                Q0 += 1
+            elif fold_num[n] == "2":
+                Q2 += 1
+            elif fold_num[n] == "4":
+                Q4 += 1
+            else:
+                raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
+    return (P0, P2, P4, Q0, Q2, Q4)
+
+
+#################################################################
+#               private functions for YN00 method
+#################################################################
+
+
+def _yn00(seq1, seq2, k, codon_table):
+    """YN00 method main function (PRIVATE).
+
+    Nomenclature is according to Yang and Nielsen (2000), PMID 10666704.
+    """
+    from collections import defaultdict
+    from scipy.linalg import expm
+
+    fcodon = [
+        {"A": 0, "G": 0, "C": 0, "T": 0},
+        {"A": 0, "G": 0, "C": 0, "T": 0},
+        {"A": 0, "G": 0, "C": 0, "T": 0},
+    ]
+    codon_fold_dict = _get_codon_fold(codon_table)
+    fold0_cnt = defaultdict(int)
+    fold4_cnt = defaultdict(int)
+    for codon in seq1 + seq2:
+        # count sites at different codon position
+        if codon != "---":
+            fcodon[0][codon[0]] += 1
+            fcodon[1][codon[1]] += 1
+            fcodon[2][codon[2]] += 1
+        # count sites in different degenerate fold class
+        fold_num = codon_fold_dict[codon]
+        for i, f in enumerate(fold_num):
+            if f == "0":
+                fold0_cnt[codon[i]] += 1
+            elif f == "4":
+                fold4_cnt[codon[i]] += 1
+    f0_total = sum(fold0_cnt.values())
+    f4_total = sum(fold4_cnt.values())
+    # normalize each fold class over its own keys (the two dicts need not
+    # contain the same bases)
+    for i in fold0_cnt:
+        fold0_cnt[i] = fold0_cnt[i] / f0_total
+    for i in fold4_cnt:
+        fold4_cnt[i] = fold4_cnt[i] / f4_total
+    # TODO:
+    # the initial kappa is different from what yn00 gives,
+    # try to find the problem.
+    TV = _get_TV(seq1, seq2, codon_table=codon_table)
+    k04 = (_get_kappa_t(fold0_cnt, TV), _get_kappa_t(fold4_cnt, TV))
+    kappa = (f0_total * k04[0] + f4_total * k04[1]) / (f0_total + f4_total)
+    # kappa = 2.4285
+    # count synonymous sites and non-synonymous sites
+    for i in range(3):
+        tot = sum(fcodon[i].values())
+        fcodon[i] = {j: k / tot for j, k in fcodon[i].items()}
+    pi = defaultdict(int)
+    for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:
+        if "U" not in i:
+            pi[i] = 0
+    for i in seq1 + seq2:
+        pi[i] += 1
+    S_sites1, N_sites1, bfreqSN1 = _count_site_YN00(
+        seq1, seq2, pi, k=kappa, codon_table=codon_table
+    )
+    S_sites2, N_sites2, bfreqSN2 = _count_site_YN00(
+        seq2, seq1, pi, k=kappa, codon_table=codon_table
+    )
+    N_sites = (N_sites1 + N_sites2) / 2
+    S_sites = (S_sites1 + S_sites2) / 2
+    bfreqSN = [{"A": 0, "T": 0, "C": 0, "G": 0}, {"A": 0, "T": 0, "C": 0, "G": 0}]
+    for i in range(2):
+        for b in ("A", "T", "C", "G"):
+            bfreqSN[i][b] = (bfreqSN1[i][b] + bfreqSN2[i][b]) / 2
+    # use NG86 method to get initial t and w
+    SN = [0, 0]
+    for i, j in zip(seq1, seq2):
+        SN = [
+            m + n for m, n in zip(SN, _count_diff_NG86(i, j, codon_table=codon_table))
+        ]
+    ps = SN[0] / S_sites
+    pn = SN[1] / N_sites
+    p = sum(SN) / (S_sites + N_sites)
+    w = log(1 - 4.0 / 3 * pn) / log(1 - 4.0 / 3 * ps)
+    t = -3 / 4 * log(1 - 4 / 3 * p)
+    tolerance = 1e-5
+    dSdN_pre = [0, 0]
+    for temp in range(20):
+        # count synonymous and nonsynonymous differences under kappa, w, t
+        codon_lst = [
+            i
+            for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons
+            if "U" not in i
+        ]
+        Q = _get_Q(pi, kappa, w, codon_lst, codon_table)
+        P = expm(Q * t)
+        TV = [0, 0, 0, 0]  # synonymous/nonsynonymous transition/transversion
+        codon_npath = {}
+        for i, j in zip(seq1, seq2):
+            if i != "---" and j != "---":
+                codon_npath.setdefault((i, j), 0)
+                codon_npath[(i, j)] += 1
+        for i in codon_npath:
+            tv = _count_diff_YN00(i[0], i[1], P, codon_lst, codon_table)
+            TV = [m + n * codon_npath[i] for m, n in zip(TV, tv)]
+        TV = (TV[0] / S_sites, TV[1] / S_sites), (TV[2] / N_sites, TV[3] / N_sites)
+        # following the DistanceF84() function of yn00.c in PAML,
+        # the t of eq. (10) in PMID 10666704 is evaluated here for dS and dN
+        dSdN = []
+        for f, tv in zip(bfreqSN, TV):
+            dSdN.append(_get_kappa_t(f, tv, t=True))
+        t = dSdN[0] * 3 * S_sites / (S_sites + N_sites) + dSdN[1] * 3 * N_sites / (
+            S_sites + N_sites
+        )
+        w = dSdN[1] / dSdN[0]
+        if all(abs(i - j) < tolerance for i, j in zip(dSdN, dSdN_pre)):
+            return dSdN[1], dSdN[0]  # dN, dS
+        dSdN_pre = dSdN
+
+
+def _get_TV(codon_lst1, codon_lst2, codon_table):
+    """Get TV (PRIVATE).
+
+    Arguments:
+     - T - proportions of transitional differences
+     - V - proportions of transversional differences
+
+    """
+    purine = ("A", "G")
+    pyrimidine = ("C", "T")
+    TV = [0, 0]
+    sites = 0
+    for codon1, codon2 in zip(codon_lst1, codon_lst2):
+        if "---" not in (codon1, codon2):
+            for i, j in zip(codon1, codon2):
+                if i == j:
+                    pass
+                elif i in purine and j in purine:
+                    TV[0] += 1
+                elif i in pyrimidine and j in pyrimidine:
+                    TV[0] += 1
+                else:
+                    TV[1] += 1
+                sites += 1
+    return (TV[0] / sites, TV[1] / sites)
+    # return (TV[0], TV[1])
+
+
+def _get_kappa_t(pi, TV, t=False):
+    """Calculate kappa (PRIVATE).
+
+    The following formula and variable names are according to PMID: 10666704
+    """
+    pi["Y"] = pi["T"] + pi["C"]
+    pi["R"] = pi["A"] + pi["G"]
+    A = (
+        2 * (pi["T"] * pi["C"] + pi["A"] * pi["G"])
+        + 2
+        * (
+            pi["T"] * pi["C"] * pi["R"] / pi["Y"]
+            + pi["A"] * pi["G"] * pi["Y"] / pi["R"]
+        )
+        * (1 - TV[1] / (2 * pi["Y"] * pi["R"]))
+        - TV[0]
+    ) / (2 * (pi["T"] * pi["C"] / pi["Y"] + pi["A"] * pi["G"] / pi["R"]))
+    B = 1 - TV[1] / (2 * pi["Y"] * pi["R"])
+    a = -0.5 * log(A)  # this seems to be an error in YANG's original paper
+    b = -0.5 * log(B)
+    kappaF84 = a / b - 1
+    if t is False:
+        kappaHKY85 = 1 + (
+            pi["T"] * pi["C"] / pi["Y"] + pi["A"] * pi["G"] / pi["R"]
+        ) * kappaF84 / (pi["T"] * pi["C"] + pi["A"] * pi["G"])
+        return kappaHKY85
+    else:
+        t = (
+            4 * pi["T"] * pi["C"] * (1 + kappaF84 / pi["Y"])
+            + 4 * pi["A"] * pi["G"] * (1 + kappaF84 / pi["R"])
+            + 4 * pi["Y"] * pi["R"]
+        ) * b
+        return t
+
+
+def _count_site_YN00(codon_lst1, codon_lst2, pi, k, codon_table):
+    """Site counting method from Ina / Yang and Nielsen (PRIVATE).
+
+    Method from `Ina (1995)`_ as modified by `Yang and Nielsen (2000)`_.
+    This will return the total number of synonymous and nonsynonymous sites
+    and base frequencies in each category. The function is equivalent to
+    the ``CountSites()`` function in ``yn00.c`` of PAML.
+
+    .. _`Ina (1995)`: https://doi.org/10.1007/BF00167113
+    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236
+
+    """
+    if len(codon_lst1) != len(codon_lst2):
+        raise RuntimeError(
+            "Length of two codon_lst should be the same (%d and %d detected)"
+            % (len(codon_lst1), len(codon_lst2))
+        )
+    else:
+        length = len(codon_lst1)
+    purine = ("A", "G")
+    pyrimidine = ("T", "C")
+    base_tuple = ("A", "T", "C", "G")
+    codon_dict = codon_table.forward_table
+    stop = codon_table.stop_codons
+    codon_npath = {}
+    for i, j in zip(codon_lst1, codon_lst2):
+        if i != "---" and j != "---":
+            codon_npath.setdefault((i, j), 0)
+            codon_npath[(i, j)] += 1
+    S_sites = N_sites = 0
+    freqSN = [
+        {"A": 0, "T": 0, "C": 0, "G": 0},  # synonymous
+        {"A": 0, "T": 0, "C": 0, "G": 0},
+    ]  # nonsynonymous
+    for codon_pair, npath in codon_npath.items():
+        codon = codon_pair[0]
+        S = N = 0
+        for pos in range(3):
+            for base in base_tuple:
+                if codon[pos] == base:
+                    continue
+                neighbor_codon = codon[:pos] + base + codon[pos + 1 :]
+                if neighbor_codon in stop:
+                    continue
+                weight = pi[neighbor_codon]
+                if codon[pos] in pyrimidine and base in pyrimidine:
+                    weight *= k
+                elif codon[pos] in purine and base in purine:
+                    weight *= k
+                if codon_dict[codon] == codon_dict[neighbor_codon]:
+                    S += weight
+                    freqSN[0][base] += weight * npath
+                else:
+                    N += weight
+                    freqSN[1][base] += weight * npath
+        S_sites += S * npath
+        N_sites += N * npath
+    norm_const = 3 * length / (S_sites + N_sites)
+    S_sites *= norm_const
+    N_sites *= norm_const
+    for i in freqSN:
+        norm_const = sum(i.values())
+        for b in i:
+            i[b] /= norm_const
+    return S_sites, N_sites, freqSN
+
+
+def _count_diff_YN00(codon1, codon2, P, codon_lst, codon_table):
+    """Count differences between two codons (three-letter string; PRIVATE).
+
+    The function weights the multiple pathways from codon1 to codon2
+    according to the P matrix of codon substitution. The proportions
+    of transitions and transversions (TV) are also calculated in
+    the function.
+    """
+    if not isinstance(codon1, str) or not isinstance(codon2, str):
+        raise TypeError(
+            "_count_diff_YN00 accepts string object to represent codon"
+            f" ({type(codon1)}, {type(codon2)} detected)"
+        )
+    if len(codon1) != 3 or len(codon2) != 3:
+        raise RuntimeError(
+            "codon should be three letter string"
+            f" ({len(codon1)}, {len(codon2)} detected)"
+        )
+    TV = [
+        0,
+        0,
+        0,
+        0,
+    ]  # transition and transversion counts (synonymous and nonsynonymous)
+    if codon1 == "---" or codon2 == "---":
+        return TV
+    base_tuple = ("A", "C", "G", "T")
+    if not all(i in base_tuple for i in codon1):
+        raise RuntimeError(
+            f"Unrecognized character detected in codon1 {codon1}"
+            " (Codons consist of A, T, C or G)"
+        )
+    if not all(i in base_tuple for i in codon2):
+        raise RuntimeError(
+            f"Unrecognized character detected in codon2 {codon2}"
+            " (Codons consist of A, T, C or G)"
+        )
+    if codon1 == codon2:
+        return TV
+    else:
+        diff_pos = []
+        for i, k in enumerate(zip(codon1, codon2)):
+            if k[0] != k[1]:
+                diff_pos.append(i)
+
+        def count_TV(codon1, codon2, diff, codon_table, weight=1):
+            purine = ("A", "G")
+            pyrimidine = ("T", "C")
+            dic = codon_table.forward_table
+            stop = codon_table.stop_codons
+            if codon1 in stop or codon2 in stop:
+                # stop codon is always considered as nonsynonymous
+                if codon1[diff] in purine and codon2[diff] in purine:
+                    return [0, 0, weight, 0]
+                elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
+                    return [0, 0, weight, 0]
+                else:
+                    return [0, 0, 0, weight]
+            elif dic[codon1] == dic[codon2]:
+                if codon1[diff] in purine and codon2[diff] in purine:
+                    return [weight, 0, 0, 0]
+                elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
+                    return [weight, 0, 0, 0]
+                else:
+                    return [0, weight, 0, 0]
+            else:
+                if codon1[diff] in purine and codon2[diff] in purine:
+                    return [0, 0, weight, 0]
+                elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
+                    return [0, 0, weight, 0]
+                else:
+                    return [0, 0, 0, weight]
+
+        if len(diff_pos) == 1:
+            TV = [
+                p + q
+                for p, q in zip(TV, count_TV(codon1, codon2, diff_pos[0], codon_table))
+            ]
+        elif len(diff_pos) == 2:
+            tmp_codon = [codon1[:i] + codon2[i] + codon1[i + 1 :] for i in diff_pos]
+            path_prob = []
+            for i in tmp_codon:
+                codon_idx = list(map(codon_lst.index, [codon1, i, codon2]))
+                prob = (P[codon_idx[0], codon_idx[1]], P[codon_idx[1], codon_idx[2]])
+                path_prob.append(prob[0] * prob[1])
+            path_prob = [2 * i / sum(path_prob) for i in path_prob]
+            for n, i in enumerate(diff_pos):
+                temp_codon = codon1[:i] + codon2[i] + codon1[i + 1 :]
+                TV = [
+                    p + q
+                    for p, q in zip(
+                        TV,
+                        count_TV(
+                            codon1, temp_codon, i, codon_table, weight=path_prob[n] / 2
+                        ),
+                    )
+                ]
+                # second leg of this pathway: temp_codon -> codon2 at the
+                # remaining differing position
+                other = [pos for pos in diff_pos if pos != i][0]
+                TV = [
+                    p + q
+                    for p, q in zip(
+                        TV,
+                        count_TV(
+                            temp_codon, codon2, other, codon_table,
+                            weight=path_prob[n] / 2,
+                        ),
+                    )
+                ]
+        elif len(diff_pos) == 3:
+            paths = list(permutations([0, 1, 2], 3))
+            path_prob = []
+            tmp_codon = []
+            for p in paths:
+                tmp1 = codon1[: p[0]] + codon2[p[0]] + codon1[p[0] + 1 :]
+                tmp2 = tmp1[: p[1]] + codon2[p[1]] + tmp1[p[1] + 1 :]
+                tmp_codon.append((tmp1, tmp2))
+                codon_idx = list(map(codon_lst.index, [codon1, tmp1, tmp2, codon2]))
+                prob = (
+                    P[codon_idx[0], codon_idx[1]],
+                    P[codon_idx[1], codon_idx[2]],
+                    P[codon_idx[2], codon_idx[3]],
+                )
+                path_prob.append(prob[0] * prob[1] * prob[2])
+            path_prob = [3 * i / sum(path_prob) for i in path_prob]
+            for i, j, k in zip(tmp_codon, path_prob, paths):
+                TV = [
+                    p + q
+                    for p, q in zip(
+                        TV, count_TV(codon1, i[0], k[0], codon_table, weight=j / 3)
+                    )
+                ]
+                TV = [
+                    p + q
+                    for p, q in zip(
+                        TV, count_TV(i[0], i[1], k[1], codon_table, weight=j / 3)
+                    )
+                ]
+                # the third leg changes the last remaining position, k[2]
+                TV = [
+                    p + q
+                    for p, q in zip(
+                        TV, count_TV(i[1], codon2, k[2], codon_table, weight=j / 3)
+                    )
+                ]
+    return TV
+
+
+#################################################################
+#        private functions for Maximum Likelihood method
+#################################################################
+
+
+def _ml(seq1, seq2, cmethod, codon_table):
+    """ML method main function (PRIVATE)."""
+    from collections import Counter
+    from scipy.optimize import minimize
+
+    codon_cnt = Counter()
+    pi = _get_pi(seq1, seq2, cmethod, codon_table=codon_table)
+    for i, j in zip(seq1, seq2):
+        # if i != j and ('---' not in (i, j)):
+        if "---" not in (i, j):
+            codon_cnt[(i, j)] += 1
+    codon_lst = [
+        i
+        for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons
+        if "U" not in i
+    ]
+
+    # apply optimization
+    def func(
+        params, pi=pi, codon_cnt=codon_cnt, codon_lst=codon_lst, codon_table=codon_table
+    ):
+        """Temporary function, params = [t, k, w]."""
+        return -_likelihood_func(
+            params[0],
+            params[1],
+            params[2],
+            pi,
+            codon_cnt,
+            codon_lst=codon_lst,
+            codon_table=codon_table,
+        )
+
+    # count sites
+    opt_res = minimize(
+        func,
+        [1, 0.1, 2],
+        method="L-BFGS-B",
+        bounds=((1e-10, 20), (1e-10, 20), (1e-10, 10)),
+        tol=1e-5,
+    )
+    t, k, w = opt_res.x
+    Q = _get_Q(pi, k, w, codon_lst, codon_table)
+    Sd = Nd = 0
+    for i, c1 in enumerate(codon_lst):
+        for j, c2 in enumerate(codon_lst):
+            if i != j:
+                try:
+                    if codon_table.forward_table[c1] == codon_table.forward_table[c2]:
+                        # synonymous count
+                        Sd += pi[c1] * Q[i, j]
+                    else:
+                        # nonsynonymous count
+                        Nd += pi[c1] * Q[i, j]
+                except KeyError:
+                    # This is probably due to stop codons
+                    pass
+    Sd *= t
+    Nd *= t
+
+    # count differences (with w fixed to 1)
+    def func_w1(
+        params, pi=pi, codon_cnt=codon_cnt, codon_lst=codon_lst, codon_table=codon_table
+    ):
+        """Temporary function, params = [t, k]. w is fixed to 1."""
+        return -_likelihood_func(
+            params[0],
+            params[1],
+            1.0,
+            pi,
+            codon_cnt,
+            codon_lst=codon_lst,
+            codon_table=codon_table,
+        )
+
+    opt_res = minimize(
+        func_w1,
+        [1, 0.1],
+        method="L-BFGS-B",
+        bounds=((1e-10, 20), (1e-10, 20)),
+        tol=1e-5,
+    )
+    t, k = opt_res.x
+    w = 1.0
+    Q = _get_Q(pi, k, w, codon_lst, codon_table)
+    rhoS = rhoN = 0
+    for i, c1 in enumerate(codon_lst):
+        for j, c2 in enumerate(codon_lst):
+            if i != j:
+                try:
+                    if codon_table.forward_table[c1] == codon_table.forward_table[c2]:
+                        # synonymous count
+                        rhoS += pi[c1] * Q[i, j]
+                    else:
+                        # nonsynonymous count
+                        rhoN += pi[c1] * Q[i, j]
+                except KeyError:
+                    # This is probably due to stop codons
+                    pass
+    rhoS *= 3
+    rhoN *= 3
+    dN = Nd / rhoN
+    dS = Sd / rhoS
+    return dN, dS
+
+
+def _get_pi(seq1, seq2, cmethod, codon_table):
+    """Obtain codon frequency dict (pi) from two codon list (PRIVATE).
+
+    This function is designed for the ML method. Available counting methods
+    (cfreq) are F1x4, F3x4 and F61.
+    """
+    # TODO:
+    # Stop codon should not be allowed according to Yang.
+    # Try to modify this!
+    pi = {}
+    if cmethod == "F1x4":
+        fcodon = {"A": 0, "G": 0, "C": 0, "T": 0}
+        for i in seq1 + seq2:
+            if i != "---":
+                for c in i:
+                    fcodon[c] += 1
+        tot = sum(fcodon.values())
+        fcodon = {j: k / tot for j, k in fcodon.items()}
+        for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:
+            if "U" not in i:
+                pi[i] = fcodon[i[0]] * fcodon[i[1]] * fcodon[i[2]]
+    elif cmethod == "F3x4":
+        # three codon position
+        fcodon = [
+            {"A": 0, "G": 0, "C": 0, "T": 0},
+            {"A": 0, "G": 0, "C": 0, "T": 0},
+            {"A": 0, "G": 0, "C": 0, "T": 0},
+        ]
+        for i in seq1 + seq2:
+            if i != "---":
+                fcodon[0][i[0]] += 1
+                fcodon[1][i[1]] += 1
+                fcodon[2][i[2]] += 1
+        for i in range(3):
+            tot = sum(fcodon[i].values())
+            fcodon[i] = {j: k / tot for j, k in fcodon[i].items()}
+        for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:
+            if "U" not in i:
+                pi[i] = fcodon[0][i[0]] * fcodon[1][i[1]] * fcodon[2][i[2]]
+    elif cmethod == "F61":
+        for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:
+            if "U" not in i:
+                pi[i] = 0.1
+        for i in seq1 + seq2:
+            if i != "---":
+                pi[i] += 1
+        tot = sum(pi.values())
+        pi = {j: k / tot for j, k in pi.items()}
+    return pi
+
+
+def _q(i, j, pi, k, w, codon_table):
+    """Q matrix for codon substitution (PRIVATE).
+
+    Arguments:
+     - i, j  : three letter codon string
+     - pi    : expected codon frequency
+     - k     : transition/transversion ratio
+     - w     : nonsynonymous/synonymous rate ratio
+     - codon_table: Bio.Data.CodonTable object
+
+    """
+    if i == j:
+        # diagonal elements are filled in later, in _get_Q, as the negative
+        # sum of the other elements in the row
+        return 0
+    if i in codon_table.stop_codons or j in codon_table.stop_codons:
+        return 0
+    if (i not in pi) or (j not in pi):
+        return 0
+    purine = ("A", "G")
+    pyrimidine = ("T", "C")
+    diff = []
+    for n, (c1, c2) in enumerate(zip(i, j)):
+        if c1 != c2:
+            diff.append((n, c1, c2))
+    if len(diff) >= 2:
+        return 0
+    if codon_table.forward_table[i] == codon_table.forward_table[j]:
+        # synonymous substitution
+        if diff[0][1] in purine and diff[0][2] in purine:
+            # transition
+            return k * pi[j]
+        elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:
+            # transition
+            return k * pi[j]
+        else:
+            # transversion
+            return pi[j]
+    else:
+        # nonsynonymous substitution
+        if diff[0][1] in purine and diff[0][2] in purine:
+            # transition
+            return w * k * pi[j]
+        elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:
+            # transition
+            return w * k * pi[j]
+        else:
+            # transversion
+            return w * pi[j]
+
+
+def _get_Q(pi, k, w, codon_lst, codon_table):
+    """Q matrix for codon substitution (PRIVATE)."""
+    import numpy as np
+
+    codon_num = len(codon_lst)
+    Q = np.zeros((codon_num, codon_num))
+    for i in range(codon_num):
+        for j in range(codon_num):
+            if i != j:
+                Q[i, j] = _q(
+                    codon_lst[i], codon_lst[j], pi, k, w, codon_table=codon_table
+                )
+    nucl_substitutions = 0
+    for i in range(codon_num):
+        Q[i, i] = -sum(Q[i, :])
+        try:
+            nucl_substitutions += pi[codon_lst[i]] * (-Q[i, i])
+        except KeyError:
+            pass
+    Q = Q / nucl_substitutions
+    return Q
+
+
+def _likelihood_func(t, k, w, pi, codon_cnt, codon_lst, codon_table):
+    """Likelihood function for ML method (PRIVATE)."""
+    from scipy.linalg import expm
+
+    Q = _get_Q(pi, k, w, codon_lst, codon_table)
+    P = expm(Q * t)
+    likelihood = 0
+    for i, c1 in enumerate(codon_lst):
+        for j, c2 in enumerate(codon_lst):
+            if (c1, c2) in codon_cnt:
+                if P[i, j] * pi[c1] <= 0:
+                    # log of a non-positive number is undefined; such pairs
+                    # contribute nothing to the log-likelihood
+                    continue
+                likelihood += codon_cnt[(c1, c2)] * log(pi[c1] * P[i, j])
+    return likelihood
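+# The quantity returned above is the log-likelihood
+#     l(t, k, w) = sum over codon pairs (c1, c2) of n(c1, c2) * log(pi[c1] * P[c1, c2])
+# where n counts aligned codon pairs and P = expm(Q * t).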
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/cpairwise2.cp37-win_amd64.pyd b/code/lib/Bio/cpairwise2.cp37-win_amd64.pyd
new file mode 100644
index 0000000..6826be4
Binary files /dev/null and b/code/lib/Bio/cpairwise2.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/cpairwise2module.c b/code/lib/Bio/cpairwise2module.c
new file mode 100644
index 0000000..af7843e
--- /dev/null
+++ b/code/lib/Bio/cpairwise2module.c
@@ -0,0 +1,479 @@
+/* Copyright 2002 by Jeffrey Chang.
+ * Copyright 2016, 2019 by Markus Piotrowski.
+ * All rights reserved.
+ *
+ * This file is part of the Biopython distribution and governed by your
+ * choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+ * Please see the LICENSE file that should have been included as part of this
+ * package.
+ *
+ * cpairwise2module.c
+ * Created 30 Sep 2001
+ *
+ * Optimized C routines that complement pairwise2.py.
+ */
+
+#include "Python.h"
+
+
+#define _PRECISION 1000
+#define rint(x) (int)((x)*_PRECISION+0.5)
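+/* e.g. rint(1.2345) == 1235: scores are compared at three-decimal precision. */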
+
+/* Functions in this module. */
+
+static double calc_affine_penalty(int length, double open, double extend,
+    int penalize_extend_when_opening)
+{
+    double penalty;
+
+    if(length <= 0)
+        return 0.0;
+    penalty = open + extend * length;
+    if(!penalize_extend_when_opening)
+        penalty -= extend;
+    return penalty;
+}
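+
+/* Example: open=-2, extend=-0.5, length=3 yields -3.5; with
+ * penalize_extend_when_opening false it yields -3.0, i.e. opening a gap
+ * does not additionally pay the extension penalty. */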
+
+static double _get_match_score(PyObject *py_sequenceA, PyObject *py_sequenceB,
+                               PyObject *py_match_fn, int i, int j,
+                               char *sequenceA, char *sequenceB,
+                               int use_sequence_cstring,
+                               double match, double mismatch,
+                               int use_match_mismatch_scores)
+{
+    PyObject *py_A=NULL, *py_B=NULL;
+    PyObject *py_arglist=NULL, *py_result=NULL;
+    double score = 0;
+
+    if(use_sequence_cstring && use_match_mismatch_scores) {
+        score = (sequenceA[i] == sequenceB[j]) ? match : mismatch;
+        return score;
+    }
+    /* Calculate the match score. */
+    if(!(py_A = PySequence_GetItem(py_sequenceA, i)))
+        goto _get_match_score_cleanup;
+    if(!(py_B = PySequence_GetItem(py_sequenceB, j)))
+        goto _get_match_score_cleanup;
+    if(!(py_arglist = Py_BuildValue("(OO)", py_A, py_B)))
+        goto _get_match_score_cleanup;
+
+    if(!(py_result = PyEval_CallObject(py_match_fn, py_arglist)))
+        goto _get_match_score_cleanup;
+    score = PyFloat_AsDouble(py_result);
+
+ _get_match_score_cleanup:
+    if(py_A) {
+        Py_DECREF(py_A);
+    }
+    if(py_B) {
+        Py_DECREF(py_B);
+    }
+    if(py_arglist) {
+        Py_DECREF(py_arglist);
+    }
+    if(py_result) {
+        Py_DECREF(py_result);
+    }
+    return score;
+}
+
+#if PY_MAJOR_VERSION >= 3
+static PyObject* _create_bytes_object(PyObject* o)
+{
+    PyObject* b;
+    if (PyBytes_Check(o)) {
+        return o;
+    }
+    if (!PyUnicode_Check(o)) {
+        return NULL;
+    }
+    b = PyUnicode_AsASCIIString(o);
+    if (!b) {
+        PyErr_Clear();
+        return NULL;
+    }
+    return b;
+}
+#endif
+
+/* This function is a more-or-less straightforward port of the
+ * equivalent function in pairwise2. Please see there for algorithm
+ * documentation.
+ */
+static PyObject *cpairwise2__make_score_matrix_fast(PyObject *self,
+                                                    PyObject *args)
+{
+    int i;
+    int row, col;
+    PyObject *py_sequenceA, *py_sequenceB, *py_match_fn;
+#if PY_MAJOR_VERSION >= 3
+    PyObject *py_bytesA, *py_bytesB;
+#endif
+    char *sequenceA=NULL, *sequenceB=NULL;
+    int use_sequence_cstring;
+    double open_A, extend_A, open_B, extend_B;
+    int penalize_extend_when_opening, penalize_end_gaps_A, penalize_end_gaps_B;
+    int align_globally, score_only;
+
+    PyObject *py_match=NULL, *py_mismatch=NULL;
+    double first_A_gap, first_B_gap;
+    double match, mismatch;
+    double score;
+    double best_score = 0;
+    double local_max_score = 0;
+    int use_match_mismatch_scores;
+    int lenA, lenB;
+    double *score_matrix = NULL;
+    unsigned char *trace_matrix = NULL;
+    PyObject *py_score_matrix=NULL, *py_trace_matrix=NULL;
+
+    double *col_cache_score = NULL;
+    PyObject *py_retval = NULL;
+
+    if(!PyArg_ParseTuple(args, "OOOddddi(ii)ii", &py_sequenceA, &py_sequenceB,
+                         &py_match_fn, &open_A, &extend_A, &open_B, &extend_B,
+                         &penalize_extend_when_opening,
+                         &penalize_end_gaps_A, &penalize_end_gaps_B,
+                         &align_globally, &score_only))
+        return NULL;
+    if(!PySequence_Check(py_sequenceA) || !PySequence_Check(py_sequenceB)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "py_sequenceA and py_sequenceB should be sequences.");
+        return NULL;
+    }
+
+    /* Optimize for the common case. Check to see if py_sequenceA and
+       py_sequenceB are strings.  If they are, use the c string
+       representation. */
+#if PY_MAJOR_VERSION < 3
+    use_sequence_cstring = 0;
+    if(PyString_Check(py_sequenceA) && PyString_Check(py_sequenceB)) {
+        sequenceA = PyString_AS_STRING(py_sequenceA);
+        sequenceB = PyString_AS_STRING(py_sequenceB);
+        use_sequence_cstring = 1;
+    }
+#else
+    py_bytesA = _create_bytes_object(py_sequenceA);
+    py_bytesB = _create_bytes_object(py_sequenceB);
+    if (py_bytesA && py_bytesB) {
+        sequenceA = PyBytes_AS_STRING(py_bytesA);
+        sequenceB = PyBytes_AS_STRING(py_bytesB);
+        use_sequence_cstring = 1;
+    }
+    else {
+        Py_XDECREF(py_bytesA);
+        Py_XDECREF(py_bytesB);
+        py_bytesA = py_bytesB = NULL;  /* avoid a double DECREF in cleanup */
+        use_sequence_cstring = 0;
+    }
+#endif
+
+    if(!PyCallable_Check(py_match_fn)) {
+        PyErr_SetString(PyExc_TypeError, "py_match_fn must be callable.");
+        return NULL;
+    }
+    /* Optimize for the common case. Check to see if py_match_fn is
+       an identity_match. If so, pull out the match and mismatch
+       member variables and calculate the scores myself. */
+    match = mismatch = 0;
+    use_match_mismatch_scores = 0;
+    if(!(py_match = PyObject_GetAttrString(py_match_fn, "match")))
+        goto cleanup_after_py_match_fn;
+    match = PyFloat_AsDouble(py_match);
+    if(match==-1.0 && PyErr_Occurred())
+        goto cleanup_after_py_match_fn;
+    if(!(py_mismatch = PyObject_GetAttrString(py_match_fn, "mismatch")))
+        goto cleanup_after_py_match_fn;
+    mismatch = PyFloat_AsDouble(py_mismatch);
+    if(mismatch==-1.0 && PyErr_Occurred())
+        goto cleanup_after_py_match_fn;
+    use_match_mismatch_scores = 1;
+
+ cleanup_after_py_match_fn:
+    if(PyErr_Occurred())
+        PyErr_Clear();
+    if(py_match) {
+        Py_DECREF(py_match);
+    }
+    if(py_mismatch) {
+        Py_DECREF(py_mismatch);
+    }
+    /* Cache some commonly used gap penalties */
+    first_A_gap = calc_affine_penalty(1, open_A, extend_A,
+                                      penalize_extend_when_opening);
+    first_B_gap = calc_affine_penalty(1, open_B, extend_B,
+                                      penalize_extend_when_opening);
+
+    /* Allocate matrices for storing the results and initialize first row and col. */
+    lenA = PySequence_Length(py_sequenceA);
+    lenB = PySequence_Length(py_sequenceB);
+    score_matrix = malloc((lenA+1)*(lenB+1)*sizeof(*score_matrix));
+    if(!score_matrix) {
+        PyErr_SetString(PyExc_MemoryError, "Out of memory");
+        goto _cleanup_make_score_matrix_fast;
+    }
+    for(i=0; i<(lenB+1); i++)
+        score_matrix[i] = 0;
+    for(i=0; i<(lenA+1)*(lenB+1); i += (lenB+1))
+        score_matrix[i] = 0;
+    /* If we only want the score, we don't need the trace matrix. */
+    if (!score_only){
+        trace_matrix = malloc((lenA+1)*(lenB+1)*sizeof(*trace_matrix));
+        if(!trace_matrix) {
+            PyErr_SetString(PyExc_MemoryError, "Out of memory");
+            goto _cleanup_make_score_matrix_fast;
+        }
+        for(i=0; i<(lenB+1); i++)
+            trace_matrix[i] = 0;
+        for(i=0; i<(lenA+1)*(lenB+1); i += (lenB+1))
+            trace_matrix[i] = 0;
+    }
+    else
+        trace_matrix = malloc(1);
+
+    /* Initialize the first row and col of the score matrix. */
+    for(i=0; i<=lenA; i++) {
+        if(penalize_end_gaps_B)
+            score = calc_affine_penalty(i, open_B, extend_B,
+                                        penalize_extend_when_opening);
+        else
+            score = 0;
+        score_matrix[i*(lenB+1)] = score;
+    }
+    for(i=0; i<=lenB; i++) {
+        if(penalize_end_gaps_A)
+            score = calc_affine_penalty(i, open_A, extend_A,
+                                        penalize_extend_when_opening);
+        else
+            score = 0;
+        score_matrix[i] = score;
+    }
+
+    /* Now initialize the col cache. */
+    col_cache_score = malloc((lenB+1)*sizeof(*col_cache_score));
+    memset((void *)col_cache_score, 0, (lenB+1)*sizeof(*col_cache_score));
+    for(i=0; i<=lenB; i++) {
+        col_cache_score[i] = calc_affine_penalty(i, (2*open_B), extend_B,
+                             penalize_extend_when_opening);
+    }
+
+    /* Fill in the score matrix. The row cache is calculated on the fly.*/
+    for(row=1; row<=lenA; row++) {
+        double row_cache_score = calc_affine_penalty(row, (2*open_A), extend_A,
+                                 penalize_extend_when_opening);
+        for(col=1; col<=lenB; col++) {
+            double match_score, nogap_score;
+            double row_open, row_extend, col_open, col_extend;
+            int best_score_rint, row_score_rint, col_score_rint;
+            unsigned char row_trace_score, col_trace_score, trace_score;
+
+            /* Calculate the best score. */
+            match_score = _get_match_score(py_sequenceA, py_sequenceB,
+                                           py_match_fn, row-1, col-1,
+                                           sequenceA, sequenceB,
+                                           use_sequence_cstring,
+                                           match, mismatch,
+                                           use_match_mismatch_scores);
+            if(match_score==-1.0 && PyErr_Occurred())
+                goto _cleanup_make_score_matrix_fast;
+            nogap_score = score_matrix[(row-1)*(lenB+1)+col-1] + match_score;
+
+            if (!penalize_end_gaps_A && row==lenA) {
+                row_open = score_matrix[(row)*(lenB+1)+col-1];
+                row_extend = row_cache_score;
+            }
+            else {
+                row_open = score_matrix[(row)*(lenB+1)+col-1] + first_A_gap;
+                row_extend = row_cache_score + extend_A;
+            }
+            row_cache_score = (row_open > row_extend) ? row_open : row_extend;
+
+            if (!penalize_end_gaps_B && col==lenB){
+                col_open = score_matrix[(row-1)*(lenB+1)+col];
+                col_extend = col_cache_score[col];
+            }
+            else {
+                col_open = score_matrix[(row-1)*(lenB+1)+col] + first_B_gap;
+                col_extend = col_cache_score[col] + extend_B;
+            }
+            col_cache_score[col] = (col_open > col_extend) ? col_open : col_extend;
+
+            best_score = (row_cache_score > col_cache_score[col]) ? row_cache_score : col_cache_score[col];
+            if(nogap_score > best_score)
+                best_score = nogap_score;
+
+            if (best_score > local_max_score)
+                local_max_score = best_score;
+
+            if(!align_globally && best_score < 0)
+                score_matrix[row*(lenB+1)+col] = 0;
+            else
+                score_matrix[row*(lenB+1)+col] = best_score;
+
+            if (!score_only) {
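+                /* Encode the traceback as bit flags: 1 = row open, 8 = row
+                 * extend, 4 = column open, 16 = column extend, 2 = diagonal
+                 * (no-gap) step; the flags are decoded in pairwise2.py. */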
+                row_score_rint = rint(row_cache_score);
+                col_score_rint = rint(col_cache_score[col]);
+                row_trace_score = 0;
+                col_trace_score = 0;
+                if (rint(row_open) == row_score_rint)
+                    row_trace_score = row_trace_score|1;
+                if (rint(row_extend) == row_score_rint)
+                    row_trace_score = row_trace_score|8;
+                if (rint(col_open) == col_score_rint)
+                    col_trace_score = col_trace_score|4;
+                if (rint(col_extend) == col_score_rint)
+                    col_trace_score = col_trace_score|16;
+
+                trace_score = 0;
+                best_score_rint = rint(best_score);
+                if (rint(nogap_score) == best_score_rint)
+                    trace_score = trace_score|2;
+                if (row_score_rint == best_score_rint)
+                    trace_score += row_trace_score;
+                if (col_score_rint == best_score_rint)
+                    trace_score += col_trace_score;
+                trace_matrix[row*(lenB+1)+col] = trace_score;
+            }
+        }
+    }
+
+    if (!align_globally)
+        best_score = local_max_score;
+
+    /* Save the score and traceback matrices into real python objects. */
+    if(!score_only) {
+        if(!(py_score_matrix = PyList_New(lenA+1)))
+            goto _cleanup_make_score_matrix_fast;
+        if(!(py_trace_matrix = PyList_New(lenA+1)))
+            goto _cleanup_make_score_matrix_fast;
+
+        for(row=0; row<=lenA; row++) {
+            PyObject *py_score_row, *py_trace_row;
+            if(!(py_score_row = PyList_New(lenB+1)))
+                goto _cleanup_make_score_matrix_fast;
+            PyList_SET_ITEM(py_score_matrix, row, py_score_row);
+            if(!(py_trace_row = PyList_New(lenB+1)))
+                goto _cleanup_make_score_matrix_fast;
+            PyList_SET_ITEM(py_trace_matrix, row, py_trace_row);
+
+            for(col=0; col<=lenB; col++) {
+                PyObject *py_score, *py_trace;
+                int offset = row*(lenB+1) + col;
+
+                /* Set py_score_matrix[row][col] to the score. */
+                if(!(py_score = PyFloat_FromDouble(score_matrix[offset])))
+                    goto _cleanup_make_score_matrix_fast;
+                PyList_SET_ITEM(py_score_row, col, py_score);
+
+                /* Set py_trace_matrix[row][col] to the trace value.  On
+                   the edges of the matrix (row or column is 0), the
+                   entry should be None. */
+                if(!row || !col) {
+                    Py_INCREF(Py_None);
+                    PyList_SET_ITEM(py_trace_row, col, Py_None);
+                }
+                else {
+                    if(!(py_trace = Py_BuildValue("B", trace_matrix[offset])))
+                        goto _cleanup_make_score_matrix_fast;
+                    PyList_SET_ITEM(py_trace_row, col, py_trace);
+                }
+            }
+        }
+    }
+    else {
+        py_score_matrix = PyList_New(1);
+        py_trace_matrix = PyList_New(1);
+    }
+    py_retval = Py_BuildValue("(OOd)", py_score_matrix, py_trace_matrix, best_score);
+
+ _cleanup_make_score_matrix_fast:
+    if(score_matrix)
+        free(score_matrix);
+    if(trace_matrix)
+        free(trace_matrix);
+    if(col_cache_score)
+        free(col_cache_score);
+    if(py_score_matrix){
+        Py_DECREF(py_score_matrix);
+    }
+    if(py_trace_matrix){
+        Py_DECREF(py_trace_matrix);
+    }
+
+#if PY_MAJOR_VERSION >= 3
+    if (py_bytesA != NULL && py_bytesA != py_sequenceA) Py_DECREF(py_bytesA);
+    if (py_bytesB != NULL && py_bytesB != py_sequenceB) Py_DECREF(py_bytesB);
+#endif
+
+    return py_retval;
+}
+
+static PyObject *cpairwise2_rint(PyObject *self, PyObject *args,
+                                 PyObject *keywds)
+{
+    double x;
+    int precision = _PRECISION;
+    int rint_x;
+
+    static char *kwlist[] = {"x", "precision", NULL};
+
+    if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|l", kwlist,
+                                    &x, &precision))
+        return NULL;
+    rint_x = (int)(x * precision + 0.5);
+#if PY_MAJOR_VERSION >= 3
+    return PyLong_FromLong((long)rint_x);
+#else
+    return PyInt_FromLong((long)rint_x);
+#endif
+}
+
+/* Module definition stuff */
+
+static PyMethodDef cpairwise2Methods[] = {
+    {"_make_score_matrix_fast",
+     (PyCFunction)cpairwise2__make_score_matrix_fast, METH_VARARGS, ""},
+    {"rint", (PyCFunction)cpairwise2_rint, METH_VARARGS|METH_KEYWORDS, ""},
+    {NULL, NULL, 0, NULL}
+};
+
+static char cpairwise2__doc__[] =
+"Optimized C routines that complement pairwise2.py. These are called from within pairwise2.py.\n\
+\n\
+";
+
+#if PY_MAJOR_VERSION >= 3
+
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "cpairwise2",
+        cpairwise2__doc__,
+        -1,
+        cpairwise2Methods,
+        NULL,
+        NULL,
+        NULL,
+        NULL
+};
+
+PyObject *
+PyInit_cpairwise2(void)
+
+#else
+
+void
+/* for Windows: _declspec(dllexport) initcpairwise2(void) */
+initcpairwise2(void)
+#endif
+
+{
+#if PY_MAJOR_VERSION >= 3
+    PyObject* module = PyModule_Create(&moduledef);
+    if (module==NULL) return NULL;
+    return module;
+#else
+    (void) Py_InitModule3("cpairwise2", cpairwise2Methods, cpairwise2__doc__);
+#endif
+}
diff --git a/code/lib/Bio/kNN.py b/code/lib/Bio/kNN.py
new file mode 100644
index 0000000..b3e6260
--- /dev/null
+++ b/code/lib/Bio/kNN.py
@@ -0,0 +1,138 @@
+# Copyright 2002 by Jeffrey Chang.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for doing k-nearest-neighbors classification.
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based on the classes in its surrounding neighborhood.
+
+Glossary:
+ - distance   The distance between two points in the feature space.
+ - weight     The importance given to each point for classification.
+
+Classes:
+ - kNN           Holds information for a nearest neighbors classifier.
+
+
+Functions:
+ - train        Train a new kNN classifier.
+ - calculate    Calculate the probabilities of each class, given an observation.
+ - classify     Classify an observation into a class.
+
+Weighting Functions:
+ - equal_weight    Every example is given a weight of 1.
+
+"""
+
+import numpy
+
+
+class kNN:
+    """Holds information necessary to do nearest neighbors classification.
+
+    Attributes:
+     - classes  Set of the possible classes.
+     - xs       List of the neighbors.
+     - ys       List of the classes that the neighbors belong to.
+     - k        Number of neighbors to look at.
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.classes = set()
+        self.xs = []
+        self.ys = []
+        self.k = None
+
+
+def equal_weight(x, y):
+    """Return integer one (dummy method for equally weighting)."""
+    # everything gets 1 vote
+    return 1
+
+
+def train(xs, ys, k, typecode=None):
+    """Train a k nearest neighbors classifier on a training set.
+
+    xs is a list of observations and ys is a list of the class assignments.
+    Thus, xs and ys should contain the same number of elements.  k is
+    the number of neighbors that should be examined when doing the
+    classification.
+    """
+    knn = kNN()
+    knn.classes = set(ys)
+    knn.xs = numpy.asarray(xs, typecode)
+    knn.ys = ys
+    knn.k = k
+    return knn
+
+
+def calculate(knn, x, weight_fn=None, distance_fn=None):
+    """Calculate the probability for each class.
+
+    Arguments:
+     - x is the observed data.
+     - weight_fn is an optional function that takes x and a training
+       example, and returns a weight.
+     - distance_fn is an optional function that takes two points and
+       returns the distance between them.  If distance_fn is None (the
+       default), the Euclidean distance is used.
+
+    Returns a dictionary mapping each class to its accumulated weight.
+    """
+    if weight_fn is None:
+        weight_fn = equal_weight
+
+    x = numpy.asarray(x)
+
+    order = []  # list of (distance, index)
+    if distance_fn:
+        for i in range(len(knn.xs)):
+            dist = distance_fn(x, knn.xs[i])
+            order.append((dist, i))
+    else:
+        # Default: Use a fast implementation of the Euclidean distance
+        temp = numpy.zeros(len(x))
+        # Predefining temp allows reuse of this array, making this
+        # function about twice as fast.
+        for i in range(len(knn.xs)):
+            temp[:] = x - knn.xs[i]
+            dist = numpy.sqrt(numpy.dot(temp, temp))
+            order.append((dist, i))
+    order.sort()
+
+    # first 'k' are the ones I want.
+    weights = {}  # class -> number of votes
+    for k in knn.classes:
+        weights[k] = 0.0
+    for dist, i in order[: knn.k]:
+        klass = knn.ys[i]
+        weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
+
+    return weights
+
+
+def classify(knn, x, weight_fn=None, distance_fn=None):
+    """Classify an observation into a class.
+
+    If not specified, weight_fn will give all neighbors equal weight.
+    distance_fn is an optional function that takes two points and returns
+    the distance between them.  If distance_fn is None (the default),
+    the Euclidean distance is used.
+    """
+    if weight_fn is None:
+        weight_fn = equal_weight
+
+    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
+
+    most_class = None
+    most_weight = None
+    for klass, weight in weights.items():
+        if most_class is None or weight > most_weight:
+            most_class = klass
+            most_weight = weight
+    return most_class
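+
+# A minimal usage sketch (illustrative only; the data points below are made
+# up and this block is not part of the original module):
+#
+#     >>> xs = [[1.0, 2.0], [2.0, 1.0], [8.0, 9.0], [9.0, 8.0]]
+#     >>> ys = [0, 0, 1, 1]
+#     >>> knn = train(xs, ys, k=3)
+#     >>> calculate(knn, [1.5, 1.5])   # votes per class among the 3 nearest
+#     {0: 2.0, 1: 1.0}
+#     >>> classify(knn, [1.5, 1.5])
+#     0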
diff --git a/code/lib/Bio/motifs/__init__.py b/code/lib/Bio/motifs/__init__.py
new file mode 100644
index 0000000..1dad775
--- /dev/null
+++ b/code/lib/Bio/motifs/__init__.py
@@ -0,0 +1,610 @@
+# Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved.
+# Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved.
+# Revisions copyright 2019 by Victor Lin.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Tools for sequence motif analysis.
+
+Bio.motifs contains the core Motif class containing various I/O methods
+as well as methods for motif comparisons and motif searching in sequences.
+It also includes functionality for parsing output from the AlignACE, MEME,
+and MAST programs, as well as files in the TRANSFAC format.
+"""
+
+import warnings
+
+from urllib.parse import urlencode
+from urllib.request import urlopen, Request
+
+
+def create(instances, alphabet="ACGT"):
+    """Create a Motif object."""
+    instances = Instances(instances, alphabet)
+    return Motif(instances=instances, alphabet=alphabet)
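+
+# For example (illustrative sketch; the sequences are made up):
+#
+#     >>> m = create(["TACAA", "TACGC", "TACAC"])
+#     >>> print(m.consensus)
+#     TACAC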
+
+
+def parse(handle, fmt, strict=True):
+    """Parse an output file from a motif finding program.
+
+    Currently supported formats (case is ignored):
+     - AlignAce:         AlignAce output file format
+     - ClusterBuster:    Cluster Buster position frequency matrix format
+     - XMS:              XMS matrix format
+     - MEME:             MEME output file motif
+     - MINIMAL:          MINIMAL MEME output file motif
+     - MAST:             MAST output file motif
+     - TRANSFAC:         TRANSFAC database file format
+     - pfm-four-columns: Generic position-frequency matrix format with four columns. (cisbp, homer, hocomoco, neph, tiffin)
+     - pfm-four-rows:    Generic position-frequency matrix format with four rows. (scertf, yetfasco, hdpi, idmmpmm, flyfactor survey)
+     - pfm:              JASPAR-style position-frequency matrix
+     - jaspar:           JASPAR-style multiple PFM format
+     - sites:            JASPAR-style sites file
+
+    As files in the pfm and sites formats contain only a single motif,
+    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
+    for those.
+
+    For example:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/alignace.out") as handle:
+    ...     for m in motifs.parse(handle, "AlignAce"):
+    ...         print(m.consensus)
+    ...
+    TCTACGATTGAG
+    CTGCACCTAGCTACGAGTGAG
+    GTGCCCTAAGCATACTAGGCG
+    GCCACTAGCAGAGCAGGGGGC
+    CGACTCAGAGGTT
+    CCACGCTAAGAGAAGTGCCGGAG
+    GCACGTCCCTGAGCA
+    GTCCATCGCAAAGCGTGGGGC
+    GAGATCAGAGGGCCG
+    TGGACGCGGGG
+    GACCAGAGCCTCGCATGGGGG
+    AGCGCGCGTG
+    GCCGGTTGCTGTTCATTAGG
+    ACCGACGGCAGCTAAAAGGG
+    GACGCCGGGGAT
+    CGACTCGCGCTTACAAGG
+
+    If strict is True (default), the parser will raise a ValueError if the
+    file contents do not strictly comply with the specified file format.
+    """
+    fmt = fmt.lower()
+    if fmt == "alignace":
+        from Bio.motifs import alignace
+
+        return alignace.read(handle)
+    elif fmt == "meme":
+        from Bio.motifs import meme
+
+        return meme.read(handle)
+    elif fmt == "minimal":
+        from Bio.motifs import minimal
+
+        return minimal.read(handle)
+    elif fmt == "clusterbuster":
+        from Bio.motifs import clusterbuster
+
+        return clusterbuster.read(handle)
+    elif fmt in ("pfm-four-columns", "pfm-four-rows"):
+        from Bio.motifs import pfm
+
+        return pfm.read(handle, fmt)
+    elif fmt == "xms":
+        from Bio.motifs import xms
+
+        return xms.read(handle)
+    elif fmt == "mast":
+        from Bio.motifs import mast
+
+        return mast.read(handle)
+    elif fmt == "transfac":
+        from Bio.motifs import transfac
+
+        return transfac.read(handle, strict)
+    elif fmt in ("pfm", "sites", "jaspar"):
+        from Bio.motifs import jaspar
+
+        return jaspar.read(handle, fmt)
+    else:
+        raise ValueError("Unknown format %s" % fmt)
+
+
+def read(handle, fmt, strict=True):
+    """Read a motif from a handle using the specified file-format.
+
+    This supports the same formats as Bio.motifs.parse(), but
+    only for files containing exactly one motif.  For example,
+    reading a JASPAR-style pfm file:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/SRF.pfm") as handle:
+    ...     m = motifs.read(handle, "pfm")
+    >>> m.consensus
+    Seq('GCCCATATATGG')
+
+    Or a single-motif MEME file,
+
+    >>> from Bio import motifs
+    >>> with open("motifs/meme.psp_test.classic.zoops.xml") as handle:
+    ...     m = motifs.read(handle, "meme")
+    >>> m.consensus
+    Seq('GCTTATGTAA')
+
+    If the handle contains no records, or more than one record,
+    an exception is raised:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/alignace.out") as handle:
+    ...     motif = motifs.read(handle, "AlignAce")
+    Traceback (most recent call last):
+        ...
+    ValueError: More than one motif found in handle
+
+    If, however, you want the first motif from a file containing
+    multiple motifs, this function would raise an exception (as
+    shown in the example above).  Instead use:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/alignace.out") as handle:
+    ...     record = motifs.parse(handle, "alignace")
+    >>> motif = record[0]
+    >>> motif.consensus
+    Seq('TCTACGATTGAG')
+
+    Use the Bio.motifs.parse(handle, fmt) function if you want
+    to read multiple records from the handle.
+
+    If strict is True (default), the parser will raise a ValueError if the
+    file contents do not strictly comply with the specified file format.
+    """
+    fmt = fmt.lower()
+    motifs = parse(handle, fmt, strict)
+    if len(motifs) == 0:
+        raise ValueError("No motifs found in handle")
+    if len(motifs) > 1:
+        raise ValueError("More than one motif found in handle")
+    motif = motifs[0]
+    return motif
+
+
+class Instances(list):
+    """Class containing a list of sequences that made the motifs."""
+
+    def __init__(self, instances=None, alphabet="ACGT"):
+        """Initialize the class."""
+        from Bio.Seq import Seq
+
+        length = None
+        if instances is not None:
+            sequences = []
+            for instance in instances:
+                if length is None:
+                    length = len(instance)
+                elif length != len(instance):
+                    message = (
+                        "All instances should have the same length (%d found, %d expected)"
+                        % (len(instance), length)
+                    )
+                    raise ValueError(message)
+                if not isinstance(instance, Seq):
+                    instance = Seq(str(instance))
+                sequences.append(instance)
+            # no errors were raised; store the instances:
+            self.extend(sequences)
+        self.length = length
+        self.alphabet = alphabet
+
+    def __str__(self):
+        """Return a string containing the sequences of the motif."""
+        text = ""
+        for instance in self:
+            text += str(instance) + "\n"
+        return text
+
+    def count(self):
+        """Count nucleotides in a position."""
+        counts = {}
+        for letter in self.alphabet:
+            counts[letter] = [0] * self.length
+        for instance in self:
+            for position, letter in enumerate(instance):
+                counts[letter][position] += 1
+        return counts
+
+    def search(self, sequence):
+        """Find positions of motifs in a given sequence.
+
+        This is a generator function, returning found positions of motif
+        instances in a given sequence.
+        """
+        for pos in range(0, len(sequence) - self.length + 1):
+            for instance in self:
+                if instance == sequence[pos : pos + self.length]:
+                    yield (pos, instance)
+                    break  # no other instance will fit (we don't want to return multiple hits)
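+
+    # Example of search() (illustrative sketch; the sequences are made up):
+    #
+    #     >>> inst = Instances(["ACGT", "ACGC"])
+    #     >>> list(inst.search("TTACGTTT"))
+    #     [(2, Seq('ACGT'))]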
+
+    def reverse_complement(self):
+        """Compute reverse complement of sequences."""
+        instances = Instances(alphabet=self.alphabet)
+        instances.length = self.length
+        for instance in self:
+            instance = instance.reverse_complement()
+            instances.append(instance)
+        return instances
+
+
+class Motif:
+    """A class representing sequence motifs."""
+
+    def __init__(self, alphabet="ACGT", instances=None, counts=None):
+        """Initialize the class."""
+        from . import matrix
+
+        self.name = ""
+        if counts is not None and instances is not None:
+            raise ValueError(
+                "Specify either instances or counts, don't specify both"
+            )
+        elif counts is not None:
+            self.instances = None
+            self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
+            self.length = self.counts.length
+        elif instances is not None:
+            self.instances = instances
+            alphabet = self.instances.alphabet
+            counts = self.instances.count()
+            self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
+            self.length = self.counts.length
+        else:
+            self.counts = None
+            self.instances = None
+            self.length = None
+        self.alphabet = alphabet
+        self.pseudocounts = None
+        self.background = None
+        self.mask = None
+
+    def __get_mask(self):
+        return self.__mask
+
+    def __set_mask(self, mask):
+        if self.length is None:
+            self.__mask = ()
+        elif mask is None:
+            self.__mask = (1,) * self.length
+        elif len(mask) != self.length:
+            raise ValueError(
+                "The length (%d) of the mask is inconsistent with the length "
+                "(%d) of the motif" % (len(mask), self.length)
+            )
+        elif isinstance(mask, str):
+            self.__mask = []
+            for char in mask:
+                if char == "*":
+                    self.__mask.append(1)
+                elif char == " ":
+                    self.__mask.append(0)
+                else:
+                    raise ValueError(
+                        "Mask should contain only '*' or ' ' and not a '%s'" % char
+                    )
+            self.__mask = tuple(self.__mask)
+        else:
+            self.__mask = tuple(int(bool(c)) for c in mask)
+
+    mask = property(__get_mask, __set_mask)
+    del __get_mask
+    del __set_mask
+
+    def __get_pseudocounts(self):
+        return self._pseudocounts
+
+    def __set_pseudocounts(self, value):
+        self._pseudocounts = {}
+        if isinstance(value, dict):
+            self._pseudocounts = {letter: value[letter] for letter in self.alphabet}
+        else:
+            if value is None:
+                value = 0.0
+            self._pseudocounts = dict.fromkeys(self.alphabet, value)
+
+    pseudocounts = property(__get_pseudocounts, __set_pseudocounts)
+    del __get_pseudocounts
+    del __set_pseudocounts
+
+    def __get_background(self):
+        return self._background
+
+    def __set_background(self, value):
+        if isinstance(value, dict):
+            self._background = {letter: value[letter] for letter in self.alphabet}
+        elif value is None:
+            self._background = dict.fromkeys(self.alphabet, 1.0)
+        else:
+            if sorted(self.alphabet) != ["A", "C", "G", "T"]:
+                raise ValueError(
+                    "Setting the background to a single value only works for DNA motifs"
+                    " (in which case the value is interpreted as the GC content)"
+                )
+            self._background["A"] = (1.0 - value) / 2.0
+            self._background["C"] = value / 2.0
+            self._background["G"] = value / 2.0
+            self._background["T"] = (1.0 - value) / 2.0
+        total = sum(self._background.values())
+        for letter in self.alphabet:
+            self._background[letter] /= total
+
+    background = property(__get_background, __set_background)
+    del __get_background
+    del __set_background
+
+    @property
+    def pwm(self):
+        """Compute position weight matrices."""
+        return self.counts.normalize(self._pseudocounts)
+
+    @property
+    def pssm(self):
+        """Compute position specific scoring matrices."""
+        return self.pwm.log_odds(self._background)
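+
+    # Sketch of the pwm/pssm relationship (illustrative only; assumes the
+    # uniform default background and zero pseudocounts set in __init__):
+    #
+    #     >>> m = Motif(instances=Instances(["ACGT", "ACGT"]))
+    #     >>> m.pwm["A"][0]
+    #     1.0
+    #     >>> m.pssm["A"][0]          # log2(1.0 / 0.25)
+    #     2.0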
+
+    def __str__(self, masked=False):
+        """Return string representation of a motif."""
+        text = ""
+        if self.instances is not None:
+            text += str(self.instances)
+
+        if masked:
+            for i in range(self.length):
+                if self.__mask[i]:
+                    text += "*"
+                else:
+                    text += " "
+            text += "\n"
+        return text
+
+    def __len__(self):
+        """Return the length of a motif.
+
+        Please use this method (i.e. invoke len(m)) instead of referring to m.length directly.
+        """
+        if self.length is None:
+            return 0
+        else:
+            return self.length
+
+    def reverse_complement(self):
+        """Return the reverse complement of the motif as a new motif."""
+        alphabet = self.alphabet
+        if self.instances is not None:
+            instances = self.instances.reverse_complement()
+            res = Motif(alphabet=alphabet, instances=instances)
+        else:  # has counts
+            counts = {
+                "A": self.counts["T"][::-1],
+                "C": self.counts["G"][::-1],
+                "G": self.counts["C"][::-1],
+                "T": self.counts["A"][::-1],
+            }
+            res = Motif(alphabet=alphabet, counts=counts)
+        res.__mask = self.__mask[::-1]
+        res.background = {
+            "A": self.background["T"],
+            "C": self.background["G"],
+            "G": self.background["C"],
+            "T": self.background["A"],
+        }
+        res.pseudocounts = {
+            "A": self.pseudocounts["T"],
+            "C": self.pseudocounts["G"],
+            "G": self.pseudocounts["C"],
+            "T": self.pseudocounts["A"],
+        }
+        return res
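+
+    # Sketch (illustrative only): the columns are reversed and A<->T, C<->G
+    # swapped, so for instance:
+    #
+    #     >>> m = create(["AACC"])
+    #     >>> print(m.reverse_complement().consensus)
+    #     GGTT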
+
+    @property
+    def consensus(self):
+        """Return the consensus sequence."""
+        return self.counts.consensus
+
+    @property
+    def anticonsensus(self):
+        """Return the least probable pattern to be generated from this motif."""
+        return self.counts.anticonsensus
+
+    @property
+    def degenerate_consensus(self):
+        """Return the degenerate consensus sequence.
+
+        Following the rules adapted from
+        D. R. Cavener: "Comparison of the consensus sequence flanking
+        translational start sites in Drosophila and vertebrates."
+        Nucleic Acids Research 15(4): 1353-1361. (1987).
+
+        The same rules are used by TRANSFAC.
+        """
+        return self.counts.degenerate_consensus
+
+    def weblogo(self, fname, fmt="PNG", version="2.8.2", **kwds):
+        """Download and save a weblogo using the Berkeley weblogo service.
+
+        Requires an internet connection.
+
+        The parameters from ``**kwds`` are passed directly to the weblogo server.
+
+        Currently, this method uses WebLogo version 3.3.
+        These are the arguments and their default values passed to
+        WebLogo 3.3; see their website at http://weblogo.threeplusone.com
+        for more information::
+
+            'stack_width' : 'medium',
+            'stacks_per_line' : '40',
+            'alphabet' : 'alphabet_dna',
+            'ignore_lower_case' : True,
+            'unit_name' : "bits",
+            'first_index' : '1',
+            'logo_start' : '1',
+            'logo_end': str(self.length),
+            'composition' : "comp_auto",
+            'percentCG' : '',
+            'scale_width' : True,
+            'show_errorbars' : True,
+            'logo_title' : '',
+            'logo_label' : '',
+            'show_xaxis': True,
+            'xaxis_label': '',
+            'show_yaxis': True,
+            'yaxis_label': '',
+            'yaxis_scale': 'auto',
+            'yaxis_tic_interval' : '1.0',
+            'show_ends' : True,
+            'show_fineprint' : True,
+            'color_scheme': 'color_auto',
+            'symbols0': '',
+            'symbols1': '',
+            'symbols2': '',
+            'symbols3': '',
+            'symbols4': '',
+            'color0': '',
+            'color1': '',
+            'color2': '',
+            'color3': '',
+            'color4': '',
+
+        """
+        if set(self.alphabet) == set("ACDEFGHIKLMNPQRSTVWY"):
+            alpha = "alphabet_protein"
+        elif set(self.alphabet) == set("ACGU"):
+            alpha = "alphabet_rna"
+        elif set(self.alphabet) == set("ACGT"):
+            alpha = "alphabet_dna"
+        else:
+            alpha = "auto"
+
+        frequencies = format(self, "transfac")
+        url = "http://weblogo.threeplusone.com/create.cgi"
+        values = {
+            "sequences": frequencies,
+            "format": fmt.lower(),
+            "stack_width": "medium",
+            "stacks_per_line": "40",
+            "alphabet": alpha,
+            "ignore_lower_case": True,
+            "unit_name": "bits",
+            "first_index": "1",
+            "logo_start": "1",
+            "logo_end": str(self.length),
+            "composition": "comp_auto",
+            "percentCG": "",
+            "scale_width": True,
+            "show_errorbars": True,
+            "logo_title": "",
+            "logo_label": "",
+            "show_xaxis": True,
+            "xaxis_label": "",
+            "show_yaxis": True,
+            "yaxis_label": "",
+            "yaxis_scale": "auto",
+            "yaxis_tic_interval": "1.0",
+            "show_ends": True,
+            "show_fineprint": True,
+            "color_scheme": "color_auto",
+            "symbols0": "",
+            "symbols1": "",
+            "symbols2": "",
+            "symbols3": "",
+            "symbols4": "",
+            "color0": "",
+            "color1": "",
+            "color2": "",
+            "color3": "",
+            "color4": "",
+        }
+
+        values.update({k: "" if v is False else str(v) for k, v in kwds.items()})
+        data = urlencode(values).encode("utf-8")
+        req = Request(url, data)
+        response = urlopen(req)
+        with open(fname, "wb") as f:
+            im = response.read()
+            f.write(im)
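+
+    # Usage sketch (illustrative only; requires network access, and the
+    # output file names are arbitrary):
+    #
+    #     >>> m.weblogo("logo.png")             # doctest: +SKIP
+    #     >>> m.weblogo("logo.pdf", fmt="PDF")  # doctest: +SKIP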
+
+    def __format__(self, format_spec):
+        """Return a string representation of the Motif in the given format.
+
+        Currently supported formats:
+         - clusterbuster: Cluster Buster position frequency matrix format
+         - pfm : JASPAR single Position Frequency Matrix
+         - jaspar : JASPAR multiple Position Frequency Matrix
+         - transfac : TRANSFAC like files
+
+        """
+        return self.format(format_spec)
+
+    def format(self, format_spec):
+        """Return a string representation of the Motif in the given format.
+
+        Currently supported formats:
+         - clusterbuster: Cluster Buster position frequency matrix format
+         - pfm : JASPAR single Position Frequency Matrix
+         - jaspar : JASPAR multiple Position Frequency Matrix
+         - transfac : TRANSFAC like files
+
+        """
+        if format_spec in ("pfm", "jaspar"):
+            from Bio.motifs import jaspar
+
+            motifs = [self]
+            return jaspar.write(motifs, format_spec)
+        elif format_spec == "transfac":
+            from Bio.motifs import transfac
+
+            motifs = [self]
+            return transfac.write(motifs)
+        elif format_spec == "clusterbuster":
+            from Bio.motifs import clusterbuster
+
+            motifs = [self]
+            return clusterbuster.write(motifs)
+        else:
+            raise ValueError("Unknown format type %s" % format_spec)
+
+
+def write(motifs, fmt):
+    """Return a string representation of motifs in the given format.
+
+    Currently supported formats (case is ignored):
+     - clusterbuster: Cluster Buster position frequency matrix format
+     - pfm : JASPAR simple single Position Frequency Matrix
+     - jaspar : JASPAR multiple PFM format
+     - transfac : TRANSFAC like files
+
+    """
+    fmt = fmt.lower()
+    if fmt in ("pfm", "jaspar"):
+        from Bio.motifs import jaspar
+
+        return jaspar.write(motifs, fmt)
+    elif fmt == "transfac":
+        from Bio.motifs import transfac
+
+        return transfac.write(motifs)
+    elif fmt == "clusterbuster":
+        from Bio.motifs import clusterbuster
+
+        return clusterbuster.write(motifs)
+    else:
+        raise ValueError("Unknown format type %s" % fmt)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/motifs/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..e06607d
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/alignace.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/alignace.cpython-37.pyc
new file mode 100644
index 0000000..71c1800
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/alignace.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/clusterbuster.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/clusterbuster.cpython-37.pyc
new file mode 100644
index 0000000..6d0cd44
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/clusterbuster.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/mast.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/mast.cpython-37.pyc
new file mode 100644
index 0000000..4b661e5
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/mast.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/matrix.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/matrix.cpython-37.pyc
new file mode 100644
index 0000000..71ef1d4
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/matrix.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/meme.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/meme.cpython-37.pyc
new file mode 100644
index 0000000..40aae17
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/meme.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/minimal.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/minimal.cpython-37.pyc
new file mode 100644
index 0000000..b85407c
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/minimal.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/pfm.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/pfm.cpython-37.pyc
new file mode 100644
index 0000000..868de47
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/pfm.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/thresholds.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/thresholds.cpython-37.pyc
new file mode 100644
index 0000000..f4dbf69
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/thresholds.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/transfac.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/transfac.cpython-37.pyc
new file mode 100644
index 0000000..d9a3fc9
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/transfac.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/__pycache__/xms.cpython-37.pyc b/code/lib/Bio/motifs/__pycache__/xms.cpython-37.pyc
new file mode 100644
index 0000000..c680baa
Binary files /dev/null and b/code/lib/Bio/motifs/__pycache__/xms.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/_pwm.c b/code/lib/Bio/motifs/_pwm.c
new file mode 100644
index 0000000..e29123b
--- /dev/null
+++ b/code/lib/Bio/motifs/_pwm.c
@@ -0,0 +1,216 @@
+/* Copyright 2009-2020 by Michiel de Hoon.  All rights reserved.
+ *
+ * This file is part of the Biopython distribution and governed by your
+ * choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+ * Please see the LICENSE file that should have been included as part of this
+ * package.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <math.h>
+
+
+static void
+calculate(const char sequence[], int s, Py_ssize_t m, double* matrix,
+          Py_ssize_t n, float* scores)
+{
+    Py_ssize_t i, j;
+    char c;
+    double score;
+    int ok;
+    float* p = scores;
+#ifndef NAN
+    float NAN = 0.0;
+    NAN /= NAN;
+#endif
+
+    for (i = 0; i < n; i++)
+    {
+        score = 0.0;
+        ok = 1;
+        for (j = 0; j < m; j++)
+        {
+            c = sequence[i+j];
+            switch (c)
+            {
+              /* Handling mixed case input here rather than converting it to
+                 uppercase in Python code first, since doing so could use too
+                 much memory if sequence is too long (e.g. chromosome or
+                 plasmid). */
+                case 'A':
+                case 'a':
+                    score += matrix[j*4+0]; break;
+                case 'C':
+                case 'c':
+                    score += matrix[j*4+1]; break;
+                case 'G':
+                case 'g':
+                    score += matrix[j*4+2]; break;
+                case 'T':
+                case 't':
+                    score += matrix[j*4+3]; break;
+                default:
+                    ok = 0;
+            }
+        }
+        if (ok) *p = (float)score;
+        else *p = NAN;
+        p++;
+    }
+}
+
+static int
+matrix_converter(PyObject* object, void* address)
+{
+    const int flags = PyBUF_C_CONTIGUOUS | PyBUF_FORMAT;
+    char datatype;
+    Py_buffer* view = address;
+
+    if (object == NULL) goto exit;
+    if (PyObject_GetBuffer(object, view, flags) == -1) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "position-weight matrix is not an array");
+        return 0;
+    }
+    datatype = view->format[0];
+    switch (datatype) {
+        case '@':
+        case '=':
+        case '<':
+        case '>':
+        case '!': datatype = view->format[1]; break;
+        default: break;
+    }
+    if (datatype != 'd') {
+        PyErr_Format(PyExc_RuntimeError,
+            "position-weight matrix data format incorrect "
+            "('%c', expected 'd')", datatype);
+        goto exit;
+    }
+    if (view->ndim != 2) {
+        PyErr_Format(PyExc_RuntimeError,
+            "position-weight matrix has incorrect rank (%d expected 2)",
+            view->ndim);
+        goto exit;
+    }
+    if (view->shape[1] != 4) {
+        PyErr_Format(PyExc_RuntimeError,
+            "position-weight matrix should have four columns "
+            "(%zd columns found)", view->shape[1]);
+        goto exit;
+    }
+    return Py_CLEANUP_SUPPORTED;
+
+exit:
+    PyBuffer_Release(view);
+    return 0;
+}
+
+static int
+scores_converter(PyObject* object, void* address)
+{
+    const int flags = PyBUF_C_CONTIGUOUS | PyBUF_FORMAT;
+    char datatype;
+    Py_buffer* view = address;
+
+    if (object == NULL) goto exit;
+    if (PyObject_GetBuffer(object, view, flags) == -1) return 0;
+    datatype = view->format[0];
+    switch (datatype) {
+        case '@':
+        case '=':
+        case '<':
+        case '>':
+        case '!': datatype = view->format[1]; break;
+        default: break;
+    }
+    if (datatype != 'f') {
+        PyErr_Format(PyExc_RuntimeError,
+            "scores array has incorrect data format ('%c', expected 'f')",
+            datatype);
+        goto exit;
+    }
+    if (view->ndim != 1) {
+        PyErr_Format(PyExc_ValueError,
+            "scores array has incorrect rank (%d expected 1)",
+            view->ndim);
+        goto exit;
+    }
+    return Py_CLEANUP_SUPPORTED;
+
+exit:
+    PyBuffer_Release(view);
+    return 0;
+}
+
+static char calculate__doc__[] =
+"    calculate(sequence, pwm, scores)\n"
+"\n"
+"This function calculates the position-weight matrix scores for all\n"
+"positions along the sequence for position-weight matrix pwm, and stores\n"
+"them in the provided numpy array scores.\n";
+
+static PyObject*
+py_calculate(PyObject* self, PyObject* args, PyObject* keywords)
+{
+    const char* sequence;
+    static char* kwlist[] = {"sequence", "matrix", "scores", NULL};
+    Py_ssize_t m;
+    Py_ssize_t n;
+    Py_ssize_t s;
+    PyObject* result = NULL;
+    Py_buffer scores;
+    Py_buffer matrix;
+
+    matrix.obj = NULL;
+    scores.obj = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#O&O&", kwlist,
+                                     &sequence, &s,
+                                     matrix_converter, &matrix,
+                                     scores_converter, &scores)) return NULL;
+    m = matrix.shape[0];
+    n = scores.shape[0];
+    if (n == s - m + 1) {
+        calculate(sequence, s, m, matrix.buf, n, scores.buf);
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    else {
+        PyErr_Format(PyExc_RuntimeError,
+                    "size of scores array is inconsistent "
+                    "(sequence length is %zd, "
+                    "motif length is %zd, scores length is %zd", s, m, n);
+    }
+
+    matrix_converter(NULL, &matrix);
+    scores_converter(NULL, &scores);
+    return result;
+}
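+
+/* Worked example (illustrative sketch): for a sequence of length s=10 and a
+ * position-weight matrix with m=3 rows, the scores buffer must hold exactly
+ * n = s - m + 1 = 8 floats, one score per alignment offset. */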
+
+static struct PyMethodDef methods[] = {
+   {"calculate",
+    (PyCFunction)py_calculate,
+    METH_VARARGS | METH_KEYWORDS,
+    PyDoc_STR(calculate__doc__),
+   },
+   {NULL, NULL, 0, NULL} // sentinel
+};
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "_pwm",
+    PyDoc_STR("Fast calculations involving position-weight matrices"),
+    -1,
+    methods,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+
+PyObject*
+PyInit__pwm(void)
+{
+    return PyModule_Create(&moduledef);
+}
diff --git a/code/lib/Bio/motifs/_pwm.cp37-win_amd64.pyd b/code/lib/Bio/motifs/_pwm.cp37-win_amd64.pyd
new file mode 100644
index 0000000..89817dc
Binary files /dev/null and b/code/lib/Bio/motifs/_pwm.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/motifs/alignace.py b/code/lib/Bio/motifs/alignace.py
new file mode 100644
index 0000000..42d5e0a
--- /dev/null
+++ b/code/lib/Bio/motifs/alignace.py
@@ -0,0 +1,67 @@
+# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parsing AlignACE output files."""
+
+from Bio.motifs import Motif, Instances
+from Bio.Seq import Seq
+
+
+class Record(list):
+    """AlignACE record (subclass of Python list)."""
+
+    def __init__(self):
+        """Initialize the class."""
+        self.parameters = None
+
+
+def read(handle):
+    """Parse an AlignACE format handle as a Record object."""
+    record = Record()
+    line = next(handle)
+    record.version = line.strip()
+    line = next(handle)
+    record.command = line.strip()
+    mask = None
+    number = None
+    for line in handle:
+        line = line.strip()
+        if line == "":
+            pass
+        elif line[:4] == "Para":
+            record.parameters = {}
+        elif line[0] == "#":
+            seq_name = line.split("\t")[1]
+            record.sequences.append(seq_name)
+        elif "=" in line:
+            par_name, par_value = line.split("=")
+            par_name = par_name.strip()
+            par_value = par_value.strip()
+            record.parameters[par_name] = par_value
+        elif line[:5] == "Input":
+            record.sequences = []
+        elif line[:5] == "Motif":
+            words = line.split()
+            assert words[0] == "Motif"
+            number = int(words[1])
+            instances = []
+        elif line[:3] == "MAP":
+            alphabet = "ACGT"
+            instances = Instances(instances, alphabet)
+            motif = Motif(alphabet, instances)
+            motif.score = float(line.split()[-1])
+            motif.number = number
+            motif.mask = mask
+            record.append(motif)
+        elif len(line.split("\t")) == 4:
+            seq = Seq(line.split("\t")[0])
+            instances.append(seq)
+        elif "*" in line:
+            mask = line.strip("\r\n")
+        else:
+            raise ValueError(line)
+    return record
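+
+# Usage sketch (illustrative only; "alignace.out" is a hypothetical file
+# name):
+#
+#     >>> with open("alignace.out") as handle:   # doctest: +SKIP
+#     ...     record = read(handle)
+#     >>> record[0].consensus                    # doctest: +SKIP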
diff --git a/code/lib/Bio/motifs/applications/__init__.py b/code/lib/Bio/motifs/applications/__init__.py
new file mode 100644
index 0000000..0832554
--- /dev/null
+++ b/code/lib/Bio/motifs/applications/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2009 by Bartek Wilczynski.  All rights reserved.
+# Revisions copyright 2009 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Motif command line tool wrappers (OBSOLETE).
+
+We have decided to remove this module in future, and instead recommend
+building your command and invoking it via the subprocess module directly.
+"""
+
+from ._xxmotif import XXmotifCommandline
diff --git a/code/lib/Bio/motifs/applications/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/motifs/applications/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..17fcc25
Binary files /dev/null and b/code/lib/Bio/motifs/applications/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/applications/__pycache__/_xxmotif.cpython-37.pyc b/code/lib/Bio/motifs/applications/__pycache__/_xxmotif.cpython-37.pyc
new file mode 100644
index 0000000..9b94dad
Binary files /dev/null and b/code/lib/Bio/motifs/applications/__pycache__/_xxmotif.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/applications/_xxmotif.py b/code/lib/Bio/motifs/applications/_xxmotif.py
new file mode 100644
index 0000000..6ff8cf0
--- /dev/null
+++ b/code/lib/Bio/motifs/applications/_xxmotif.py
@@ -0,0 +1,261 @@
+# Copyright 2012 by Christian Brueffer.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Command line wrapper for the motif finding program XXmotif."""
+
+
+import os
+from Bio.Application import AbstractCommandline, _Option, _Switch, _Argument
+
+
+class XXmotifCommandline(AbstractCommandline):
+    """Command line wrapper for XXmotif.
+
+    http://xxmotif.genzentrum.lmu.de/
+
+    Notes
+    -----
+    Last checked against version: 1.3
+
+    References
+    ----------
+    Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive,
+    weight matriX-based motif discovery in nucleotide sequences,
+    Nucleic Acids Res. 40: W104-W109 (2012).
+
+    Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value
+    based regulatory motif discovery using positional weight matrices,
+    Genome Res. 23: 181–194 (2013)
+
+    Examples
+    --------
+    >>> from Bio.motifs.applications import XXmotifCommandline
+    >>> out_dir = "results"
+    >>> in_file = "sequences.fasta"
+    >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True)
+    >>> print(xxmotif_cline)
+    XXmotif results sequences.fasta --revcomp
+
+    You would typically run the command line with xxmotif_cline() or via
+    the Python subprocess module, as described in the Biopython tutorial.
+
+    """
+
+    def __init__(self, cmd="XXmotif", **kwargs):
+        """Initialize the class."""
+        # order of parameters is the same as in XXmotif --help
+        _valid_alphabet = set("ACGTNX")
+
+        self.parameters = [
+            _Argument(
+                ["outdir", "OUTDIR"],
+                "output directory for all results",
+                filename=True,
+                is_required=True,
+                # XXmotif currently does not accept spaces in the outdir name
+                checker_function=lambda x: " " not in x,
+            ),
+            _Argument(
+                ["seqfile", "SEQFILE"],
+                "file name with sequences from positive set in FASTA format",
+                filename=True,
+                is_required=True,
+                # XXmotif currently only accepts a pure filename
+                checker_function=lambda x: os.path.split(x)[0] == "",
+            ),
+            # Options
+            _Option(
+                ["--negSet", "negSet", "NEGSET", "negset"],
+                "sequence set which has to be used as a reference set",
+                filename=True,
+                equate=False,
+            ),
+            _Switch(
+                ["--zoops", "ZOOPS", "zoops"],
+                "use zero-or-one occurrence per sequence model (DEFAULT)",
+            ),
+            _Switch(
+                ["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model"
+            ),
+            _Switch(
+                ["--oops", "OOPS", "oops"], "use one occurrence per sequence model"
+            ),
+            _Switch(
+                ["--revcomp", "REVCOMP", "revcomp"],
+                "search in reverse complement of sequences as well (DEFAULT: NO)",
+            ),
+            _Option(
+                [
+                    "--background-model-order",
+                    "background-model-order",
+                    "BACKGROUND-MODEL-ORDER",
+                    "background_model_order",
+                ],
+                "order of background distribution (DEFAULT: 2, 8(--negset) )",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["--pseudo", "PSEUDO", "pseudo"],
+                "percentage of pseudocounts used (DEFAULT: 10)",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["-g", "--gaps", "GAPS", "gaps"],
+                "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)",
+                checker_function=lambda x: x in range(4),
+                equate=False,
+            ),
+            _Option(
+                ["--type", "TYPE", "type"],
+                "defines what kind of start seeds are used (DEFAULT: ALL)"
+                "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM",
+                checker_function=lambda x: x
+                in [
+                    "ALL",
+                    "all",
+                    "FIVEMERS",
+                    "fivemers",
+                    "PALINDROME",
+                    "palindrome",
+                    "TANDEM",
+                    "tandem",
+                    "NOPALINDROME",
+                    "nopalindrome",
+                    "NOTANDEM",
+                    "notandem",
+                ],
+                equate=False,
+            ),
+            _Option(
+                [
+                    "--merge-motif-threshold",
+                    "merge-motif-threshold",
+                    "MERGE-MOTIF-THRESHOLD",
+                    "merge_motif_threshold",
+                ],
+                "defines the similarity threshold for merging motifs (DEFAULT: HIGH)"
+                "possible modes: LOW, MEDIUM, HIGH",
+                checker_function=lambda x: x
+                in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"],
+                equate=False,
+            ),
+            _Switch(
+                [
+                    "--no-pwm-length-optimization",
+                    "no-pwm-length-optimization",
+                    "NO-PWM-LENGTH-OPTIMIZATION",
+                    "no_pwm_length_optimization",
+                ],
+                "do not optimize length during iterations (runtime advantages)",
+            ),
+            _Option(
+                [
+                    "--max-match-positions",
+                    "max-match-positions",
+                    "MAX-MATCH-POSITIONS",
+                    "max_match_positions",
+                ],
+                "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Switch(
+                ["--batch", "BATCH", "batch"],
+                "suppress progress bars (reduce output size for batch jobs)",
+            ),
+            _Option(
+                ["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"],
+                "maximum number of sequences from the positive set used [DEFAULT: all]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # does not make sense in biopython
+            # _Switch(["--help", "help", "HELP"],
+            #         "print this help page"),
+            _Option(
+                ["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"],
+                "inspect extensions and refinement of a given seed (DEFAULT: not used)",
+                checker_function=lambda x: any((c in _valid_alphabet) for c in x),
+                equate=False,
+            ),
+            # Using conservation information
+            _Option(
+                ["--format", "FORMAT", "format"],
+                "defines what kind of format the input sequences have (DEFAULT: FASTA)",
+                checker_function=lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"],
+                equate=False,
+            ),
+            _Option(
+                [
+                    "--maxMultipleSequences",
+                    "maxMultipleSequences",
+                    "MAXMULTIPLESEQUENCES",
+                    "maxmultiplesequences",
+                ],
+                "maximum number of sequences used in an alignment [DEFAULT: all]",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Using localization information
+            _Switch(
+                ["--localization", "LOCALIZATION", "localization"],
+                "use localization information to calculate combined P-values"
+                "(sequences should have all the same length)",
+            ),
+            _Option(
+                ["--downstream", "DOWNSTREAM", "downstream"],
+                "number of residues in positive set downstream of anchor point (DEFAULT: 0)",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # Start with self defined motif
+            _Option(
+                ["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"],
+                "Start motif (IUPAC characters)",
+                checker_function=lambda x: any((c in _valid_alphabet) for c in x),
+                equate=False,
+            ),
+            _Option(
+                ["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"],
+                "profile file",
+                filename=True,
+                equate=False,
+            ),
+            _Option(
+                ["--startRegion", "startRegion", "STARTREGION", "startregion"],
+                "expected start position for motif occurrences relative to anchor point (--localization)",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            _Option(
+                ["--endRegion", "endRegion", "ENDREGION", "endregion"],
+                "expected end position for motif occurrences relative to anchor point (--localization)",
+                checker_function=lambda x: isinstance(x, int),
+                equate=False,
+            ),
+            # XXmotif wrapper options
+            _Switch(
+                ["--XXmasker", "masker"],
+                "mask the input sequences for homology, repeats and low complexity regions",
+            ),
+            _Switch(
+                ["--XXmasker-pos", "maskerpos"],
+                "mask only the positive set for homology, repeats and low complexity regions",
+            ),
+            _Switch(
+                ["--no-graphics", "nographics"], "run XXmotif without graphical output"
+            ),
+        ]
+        AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/motifs/clusterbuster.py b/code/lib/Bio/motifs/clusterbuster.py
new file mode 100644
index 0000000..8340fea
--- /dev/null
+++ b/code/lib/Bio/motifs/clusterbuster.py
@@ -0,0 +1,80 @@
+# Copyright 2015 by Gert Hulselmans.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parse Cluster Buster position frequency matrix files."""
+
+from Bio import motifs
+
+
+class Record(list):
+    """Class to store the information in a Cluster Buster matrix table.
+
+    The record inherits from a list containing the individual motifs.
+    """
+
+    def __str__(self):
+        return "\n".join(str(motif) for motif in self)
+
+
+def read(handle):
+    """Read motifs in Cluster Buster position frequency matrix format from a file handle.
+
+    Cluster Buster motif format: http://zlab.bu.edu/cluster-buster/help/cis-format.html
+    """
+    motif_nbr = 0
+    record = Record()
+    nucleotide_counts = {"A": [], "C": [], "G": [], "T": []}
+    motif_name = ""
+
+    for line in handle:
+        line = line.strip()
+        if line:
+            if line.startswith(">"):
+                if motif_nbr != 0:
+                    motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                    motif.name = motif_name
+                    record.append(motif)
+
+                motif_name = line[1:].strip()
+                nucleotide_counts = {"A": [], "C": [], "G": [], "T": []}
+                motif_nbr += 1
+            else:
+                if line.startswith("#"):
+                    continue
+
+                matrix_columns = line.split()
+
+                if len(matrix_columns) == 4:
+                    for nucleotide, nucleotide_count in zip(
+                        ["A", "C", "G", "T"], matrix_columns
+                    ):
+                        nucleotide_counts[nucleotide].append(float(nucleotide_count))
+
+    # Append the final motif; skip if the file contained no motifs at all.
+    if motif_nbr != 0:
+        motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+        motif.name = motif_name
+        record.append(motif)
+
+    return record
+
+
+def write(motifs):
+    """Return the representation of motifs in Cluster Buster position frequency matrix format."""
+    lines = []
+    for m in motifs:
+        line = f">{m.name}\n"
+        lines.append(line)
+        for ACGT_counts in zip(
+            m.counts["A"], m.counts["C"], m.counts["G"], m.counts["T"]
+        ):
+            lines.append("{:0.0f}\t{:0.0f}\t{:0.0f}\t{:0.0f}\n".format(*ACGT_counts))
+
+    # Finished; glue the lines together.
+    text = "".join(lines)
+
+    return text
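+
+# Round-trip sketch (illustrative only; "motifs.cb" is a hypothetical file
+# name):
+#
+#     >>> with open("motifs.cb") as handle:      # doctest: +SKIP
+#     ...     record = read(handle)
+#     >>> print(write(record))                   # doctest: +SKIP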
diff --git a/code/lib/Bio/motifs/jaspar/__init__.py b/code/lib/Bio/motifs/jaspar/__init__.py
new file mode 100644
index 0000000..fc09b1e
--- /dev/null
+++ b/code/lib/Bio/motifs/jaspar/__init__.py
@@ -0,0 +1,372 @@
+# Copyright 2013 by Anthony Mathelier and David Arenillas. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""JASPAR2014 module."""
+
+from Bio.Seq import Seq
+import re
+import math
+
+from Bio import motifs
+
+
+class Motif(motifs.Motif):
+    """A subclass of Bio.motifs.Motif used to represent a JASPAR profile.
+
+    Additional metadata information are stored if available. The metadata
+    availability depends on the source of the JASPAR motif (a 'pfm' format
+    file, a 'jaspar' format file or a JASPAR database).
+    """
+
+    def __init__(
+        self,
+        matrix_id,
+        name,
+        alphabet="ACGT",
+        instances=None,
+        counts=None,
+        collection=None,
+        tf_class=None,
+        tf_family=None,
+        species=None,
+        tax_group=None,
+        acc=None,
+        data_type=None,
+        medline=None,
+        pazar_id=None,
+        comment=None,
+    ):
+        """Construct a JASPAR Motif instance."""
+        motifs.Motif.__init__(self, alphabet, instances, counts)
+        self.name = name
+        self.matrix_id = matrix_id
+        self.collection = collection
+        self.tf_class = tf_class
+        self.tf_family = tf_family
+        # May have multiple so species is a list.
+        # The species are actually specified as
+        # taxonomy IDs.
+        self.species = species
+        self.tax_group = tax_group
+        self.acc = acc  # May have multiple so acc is a list.
+        self.data_type = data_type
+        self.medline = medline
+        self.pazar_id = pazar_id
+        self.comment = comment
+
+    @property
+    def base_id(self):
+        """Return the JASPAR base matrix ID."""
+        (base_id, __) = split_jaspar_id(self.matrix_id)
+        return base_id
+
+    @property
+    def version(self):
+        """Return the JASPAR matrix version."""
+        (__, version) = split_jaspar_id(self.matrix_id)
+        return version
+
+    def __str__(self):
+        """Return a string represention of the JASPAR profile.
+
+        We choose to provide only the filled metadata information.
+        """
+        tf_name_str = f"TF name\t{self.name}\n"
+        matrix_id_str = f"Matrix ID\t{self.matrix_id}\n"
+        the_string = "".join([tf_name_str, matrix_id_str])
+        if self.collection:
+            collection_str = f"Collection\t{self.collection}\n"
+            the_string = "".join([the_string, collection_str])
+        if self.tf_class:
+            tf_class_str = f"TF class\t{self.tf_class}\n"
+            the_string = "".join([the_string, tf_class_str])
+        if self.tf_family:
+            tf_family_str = f"TF family\t{self.tf_family}\n"
+            the_string = "".join([the_string, tf_family_str])
+        if self.species:
+            species_str = f"Species\t{','.join(self.species)}\n"
+            the_string = "".join([the_string, species_str])
+        if self.tax_group:
+            tax_group_str = f"Taxonomic group\t{self.tax_group}\n"
+            the_string = "".join([the_string, tax_group_str])
+        if self.acc:
+            acc_str = f"Accession\t{self.acc}\n"
+            the_string = "".join([the_string, acc_str])
+        if self.data_type:
+            data_type_str = f"Data type used\t{self.data_type}\n"
+            the_string = "".join([the_string, data_type_str])
+        if self.medline:
+            medline_str = f"Medline\t{self.medline}\n"
+            the_string = "".join([the_string, medline_str])
+        if self.pazar_id:
+            pazar_id_str = f"PAZAR ID\t{self.pazar_id}\n"
+            the_string = "".join([the_string, pazar_id_str])
+        if self.comment:
+            comment_str = f"Comments\t{self.comment}\n"
+            the_string = "".join([the_string, comment_str])
+        matrix_str = f"Matrix:\n{self.counts}\n\n"
+        the_string = "".join([the_string, matrix_str])
+        return the_string
+
+    def __hash__(self):
+        """Return the hash key corresponding to the JASPAR profile.
+
+        :note: We assume matrix IDs are unique.
+
+        """
+        return self.matrix_id.__hash__()
+
+    def __eq__(self, other):
+        """Return True if matrix IDs are the same."""
+        return self.matrix_id == other.matrix_id
+
+
+class Record(list):
+    """Represent a list of jaspar motifs.
+
+    Attributes:
+     - version: The JASPAR version used
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.version = None
+
+    def __str__(self):
+        """Return a string of all motifs in the Record."""
+        return "\n".join(str(the_motif) for the_motif in self)
+
+    def to_dict(self):
+        """Return the list of matrices as a dictionary of matrices."""
+        dic = {}
+        for motif in self:
+            dic[motif.matrix_id] = motif
+        return dic
+
+
+def read(handle, format):
+    """Read motif(s) from a file in one of several different JASPAR formats.
+
+    Return the record of PFM(s).
+    Call the appropriate routine based on the format passed.
+    """
+    format = format.lower()
+    if format == "pfm":
+        record = _read_pfm(handle)
+        return record
+    elif format == "sites":
+        record = _read_sites(handle)
+        return record
+    elif format == "jaspar":
+        record = _read_jaspar(handle)
+        return record
+    else:
+        raise ValueError("Unknown JASPAR format %s" % format)
+
+
+def write(motifs, format):
+    """Return the representation of motifs in "pfm" or "jaspar" format."""
+    letters = "ACGT"
+    lines = []
+    if format == "pfm":
+        motif = motifs[0]
+        counts = motif.counts
+        for letter in letters:
+            terms = [f"{value:6.2f}" for value in counts[letter]]
+            line = f"{' '.join(terms)}\n"
+            lines.append(line)
+    elif format == "jaspar":
+        for m in motifs:
+            counts = m.counts
+            try:
+                matrix_id = m.matrix_id
+            except AttributeError:
+                matrix_id = None
+            line = f">{matrix_id} {m.name}\n"
+            lines.append(line)
+            for letter in letters:
+                terms = [f"{value:6.2f}" for value in counts[letter]]
+                line = f"{letter} [{' '.join(terms)}]\n"
+                lines.append(line)
+    else:
+        raise ValueError("Unknown JASPAR format %s" % format)
+
+    # Finished; glue the lines together
+    text = "".join(lines)
+
+    return text
+
+
+def _read_pfm(handle):
+    """Read the motif from a JASPAR .pfm file (PRIVATE)."""
+    alphabet = "ACGT"
+    counts = {}
+
+    for letter, line in zip(alphabet, handle):
+        words = line.split()
+        # if the line starts with the nucleotide letter, skip it
+        if words[0] == letter:
+            words = words[1:]
+        counts[letter] = [float(x) for x in words]
+
+    motif = Motif(matrix_id=None, name=None, alphabet=alphabet, counts=counts)
+    motif.mask = "*" * motif.length
+    record = Record()
+    record.append(motif)
+
+    return record
+
+
+def _read_sites(handle):
+    """Read the motif from JASPAR .sites file (PRIVATE)."""
+    alphabet = "ACGT"
+    instances = []
+
+    for line in handle:
+        if not line.startswith(">"):
+            break
+        # line contains the header ">...."
+        # now read the actual sequence
+        line = next(handle)
+        instance = ""
+        for c in line.strip():
+            if c == c.upper():
+                instance += c
+        instance = Seq(instance)
+        instances.append(instance)
+
+    instances = motifs.Instances(instances, alphabet)
+    motif = Motif(matrix_id=None, name=None, alphabet=alphabet, instances=instances)
+    motif.mask = "*" * motif.length
+    record = Record()
+    record.append(motif)
+
+    return record
+
+
+def _read_jaspar(handle):
+    """Read motifs from a JASPAR formatted file (PRIVATE).
+
+    Format is one or more records of the form, e.g.::
+
+      - JASPAR 2010 matrix_only format::
+
+                >MA0001.1 AGL3
+                A  [ 0  3 79 40 66 48 65 11 65  0 ]
+                C  [94 75  4  3  1  2  5  2  3  3 ]
+                G  [ 1  0  3  4  1  0  5  3 28 88 ]
+                T  [ 2 19 11 50 29 47 22 81  1  6 ]
+
+      - JASPAR 2010-2014 PFMs format::
+
+                >MA0001.1 AGL3
+                0	3	79	40	66	48	65	11	65	0
+                94	75	4	3	1	2	5	2	3	3
+                1	0	3	4	1	0	5	3	28	88
+                2	19	11	50	29	47	22	81	1	6
+
+    """
+    alphabet = "ACGT"
+    counts = {}
+
+    record = Record()
+
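+    # head_pat matches header lines such as ">MA0001.1 AGL3" (group 1 is the
+    # matrix ID, group 3 the optional TF name); row_pat_long matches bracketed
+    # count rows such as "A  [ 0  3 79 ]"; row_pat_short matches bare
+    # whitespace-separated count rows.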
+    head_pat = re.compile(r"^>\s*(\S+)(\s+(\S+))?")
+    row_pat_long = re.compile(r"\s*([ACGT])\s*\[\s*(.*)\s*\]")
+    row_pat_short = re.compile(r"\s*(.+)\s*")
+
+    identifier = None
+    name = None
+    row_count = 0
+    nucleotides = ["A", "C", "G", "T"]
+    for line in handle:
+        line = line.strip()
+
+        head_match = head_pat.match(line)
+        row_match_long = row_pat_long.match(line)
+        row_match_short = row_pat_short.match(line)
+
+        if head_match:
+            identifier = head_match.group(1)
+            if head_match.group(3):
+                name = head_match.group(3)
+            else:
+                name = identifier
+        elif row_match_long:
+            (letter, counts_str) = row_match_long.group(1, 2)
+            words = counts_str.split()
+            counts[letter] = [float(x) for x in words]
+            row_count += 1
+            if row_count == 4:
+                record.append(Motif(identifier, name, alphabet=alphabet, counts=counts))
+                identifier = None
+                name = None
+                counts = {}
+                row_count = 0
+        elif row_match_short:
+            words = row_match_short.group(1).split()
+            counts[nucleotides[row_count]] = [float(x) for x in words]
+            row_count += 1
+            if row_count == 4:
+                record.append(Motif(identifier, name, alphabet=alphabet, counts=counts))
+                identifier = None
+                name = None
+                counts = {}
+                row_count = 0
+
+    return record
+
+
+def calculate_pseudocounts(motif):
+    """Calculate pseudocounts.
+
+    Computes the square root of the average number of observed sequences
+    (the mean column sum of the counts matrix), multiplied by the
+    background frequency of each nucleotide.
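+
+    For example, a motif compiled from 100 sites with a uniform background
+    gives sqrt(100) * 0.25 = 2.5 pseudocounts for each nucleotide.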
+    """
+    alphabet = motif.alphabet
+    background = motif.background
+
+    # It is possible to have unequal column sums so use the average
+    # number of instances.
+    total = 0
+    for i in range(motif.length):
+        total += sum(float(motif.counts[letter][i]) for letter in alphabet)
+
+    avg_nb_instances = total / motif.length
+    sq_nb_instances = math.sqrt(avg_nb_instances)
+
+    if background:
+        background = dict(background)
+    else:
+        background = dict.fromkeys(sorted(alphabet), 1.0)
+
+    total = sum(background.values())
+    pseudocounts = {}
+
+    for letter in alphabet:
+        background[letter] /= total
+        pseudocounts[letter] = sq_nb_instances * background[letter]
+
+    return pseudocounts
+
+
+def split_jaspar_id(id):
+    """Split a JASPAR matrix ID into its component.
+
+    Components are base ID and version number, e.g. 'MA0047.2' is returned as
+    ('MA0047', 2).
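+
+    For example:
+
+    >>> from Bio.motifs.jaspar import split_jaspar_id
+    >>> split_jaspar_id('MA0047.2')
+    ('MA0047', '2')
+    >>> split_jaspar_id('MA0047')
+    ('MA0047', None)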
+    """
+    id_split = id.split(".")
+
+    base_id = None
+    version = None
+    if len(id_split) == 2:
+        base_id = id_split[0]
+        version = id_split[1]
+    else:
+        base_id = id
+
+    return (base_id, version)
diff --git a/code/lib/Bio/motifs/jaspar/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/motifs/jaspar/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..ee0d260
Binary files /dev/null and b/code/lib/Bio/motifs/jaspar/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/jaspar/__pycache__/db.cpython-37.pyc b/code/lib/Bio/motifs/jaspar/__pycache__/db.cpython-37.pyc
new file mode 100644
index 0000000..3732919
Binary files /dev/null and b/code/lib/Bio/motifs/jaspar/__pycache__/db.cpython-37.pyc differ
diff --git a/code/lib/Bio/motifs/jaspar/db.py b/code/lib/Bio/motifs/jaspar/db.py
new file mode 100644
index 0000000..fc27305
--- /dev/null
+++ b/code/lib/Bio/motifs/jaspar/db.py
@@ -0,0 +1,776 @@
+# Copyright 2013 by David Arenillas and Anthony Mathelier. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Provides read access to a JASPAR5 formatted database.
+
+This module requires MySQLdb to be installed.
+
+Example; substitute your database credentials as
+appropriate::
+
+        from Bio.motifs.jaspar.db import JASPAR5
+        JASPAR_DB_HOST = "hostname.example.org"
+        JASPAR_DB_NAME = "JASPAR2018"
+        JASPAR_DB_USER = "guest"
+        JASPAR_DB_PASS = "guest"
+
+        jdb = JASPAR5(
+            host=JASPAR_DB_HOST,
+            name=JASPAR_DB_NAME,
+            user=JASPAR_DB_USER,
+            password=JASPAR_DB_PASS
+        )
+        ets1 = jdb.fetch_motif_by_id('MA0098')
+        print(ets1)
+    TF name ETS1
+    Matrix ID   MA0098.3
+    Collection  CORE
+    TF class    Tryptophan cluster factors
+    TF family   Ets-related factors
+    Species 9606
+    Taxonomic group vertebrates
+    Accession   ['P14921']
+    Data type used  HT-SELEX
+    Medline 20517297
+    PAZAR ID    TF0000070
+    Comments    Data is from Taipale HTSELEX DBD (2013)
+    Matrix:
+            0      1      2      3      4      5      6      7      8      9
+    A: 2683.00 180.00 425.00   0.00   0.00 2683.00 2683.00 1102.00  89.00 803.00
+    C: 210.00 2683.00 2683.00  21.00   0.00   0.00   9.00  21.00 712.00 401.00
+    G: 640.00 297.00   7.00 2683.00 2683.00   0.00  31.00 1580.00 124.00 1083.00
+    T: 241.00  22.00   0.00   0.00  12.00   0.00 909.00  12.00 1970.00 396.00
+
+        motifs = jdb.fetch_motifs(
+            collection = 'CORE',
+            tax_group = ['vertebrates', 'insects'],
+            tf_class = 'Homeo domain factors',
+            tf_family = ['TALE-type homeo domain factors', 'POU domain factors'],
+            min_ic = 12
+        )
+        for motif in motifs:
+            pass # do something with the motif
+"""
+
+
+import warnings
+from Bio import BiopythonWarning
+from Bio import MissingPythonDependencyError
+
+try:
+    import MySQLdb as mdb
+except ImportError:
+    raise MissingPythonDependencyError(
+        "Install MySQLdb if you want to use Bio.motifs.jaspar.db"
+    )
+
+from Bio.motifs import jaspar, matrix
+
+
+JASPAR_DFLT_COLLECTION = "CORE"
+
+
+class JASPAR5:
+    """Class representing a JASPAR5 database.
+
+    Class representing a JASPAR5 DB. The methods within are loosely based
+    on the perl TFBS::DB::JASPAR5 module.
+
+    Note: We will only implement reading of JASPAR motifs from the DB.
+    Unlike the perl module, we will not attempt to implement any methods to
+    store JASPAR motifs or create a new DB at this time.
+    """
+
+    def __init__(self, host=None, name=None, user=None, password=None):
+        """Construct a JASPAR5 instance and connect to specified DB.
+
+        Arguments:
+         - host - host name of the JASPAR DB server
+         - name - name of the JASPAR database
+         - user - user name to connect to the JASPAR DB
+         - password - JASPAR DB password
+
+        """
+        self.name = name
+        self.host = host
+        self.user = user
+        self.password = password
+
+        self.dbh = mdb.connect(host, user, password, name)
+
+    def __str__(self):
+        """Return a string represention of the JASPAR5 DB connection."""
+        return r"%s\@%s:%s" % (self.user, self.host, self.name)
+
+    def fetch_motif_by_id(self, id):
+        """Fetch a single JASPAR motif from the DB by its JASPAR matrix ID.
+
+        Example id 'MA0001.1'.
+
+        Arguments:
+         - id - JASPAR matrix ID. This may be a fully specified ID including
+                the version number (e.g. MA0049.2) or just the base ID (e.g.
+                MA0049). If only a base ID is provided, the latest version is
+                returned.
+
+        Returns:
+         - A Bio.motifs.jaspar.Motif object
+
+        **NOTE:** The perl TFBS module allows you to specify the type of matrix
+        to return (PFM, PWM, ICM) but matrices are always stored in JASPAR as
+        PFMs so this does not really belong here. Once a PFM is fetched the
+        pwm() and pssm() methods can be called to return the normalized and
+        log-odds matrices.
+
+        """
+        # separate stable ID and version number
+        (base_id, version) = jaspar.split_jaspar_id(id)
+        if not version:
+            # if ID contains no version portion, fetch the latest version
+            version = self._fetch_latest_version(base_id)
+
+        # fetch internal JASPAR matrix ID - also a check for validity
+        int_id = None
+        if version:
+            int_id = self._fetch_internal_id(base_id, version)
+
+        # fetch JASPAR motif using internal ID
+        motif = None
+        if int_id:
+            motif = self._fetch_motif_by_internal_id(int_id)
+
+        return motif
+
+    def fetch_motifs_by_name(self, name):
+        """Fetch a list of JASPAR motifs from a JASPAR DB by the given TF name(s).
+
+        Arguments:
+        name - a single name or list of names
+        Returns:
+        A list of Bio.motifs.jaspar.Motif objects
+
+        Notes:
+        Names are not guaranteed to be unique. There may be more than one
+        motif with the same name. Therefore even if name specifies a single
+        name, a list of motifs is returned. This just calls
+        self.fetch_motifs(collection = None, tf_name = name).
+
+        This behaviour is different from the TFBS perl module's
+        get_Matrix_by_name() method which always returns a single matrix,
+        issuing a warning message and returning the first matrix retrieved
+        in the case where multiple matrices have the same name.
+
+        """
+        return self.fetch_motifs(collection=None, tf_name=name)
+
+    def fetch_motifs(
+        self,
+        collection=JASPAR_DFLT_COLLECTION,
+        tf_name=None,
+        tf_class=None,
+        tf_family=None,
+        matrix_id=None,
+        tax_group=None,
+        species=None,
+        pazar_id=None,
+        data_type=None,
+        medline=None,
+        min_ic=0,
+        min_length=0,
+        min_sites=0,
+        all=False,
+        all_versions=False,
+    ):
+        """Fetch jaspar.Record (list) of motifs using selection criteria.
+
+        Arguments::
+
+            Except where obvious, all selection criteria arguments may be
+            specified as a single value or a list of values. Motifs must
+            meet ALL the specified selection criteria to be returned with
+                          the precedence exceptions noted below.
+
+            all         - Takes precedence over all other selection criteria.
+                          Every motif is returned. If 'all_versions' is also
+                          specified, all versions of every motif are returned,
+                          otherwise just the latest version of every motif is
+                          returned.
+            matrix_id   - Takes precedence over all other selection criteria
+                          except 'all'.  Only motifs with the given JASPAR
+                          matrix ID(s) are returned. A matrix ID may be
+                          specified as just a base ID or full JASPAR IDs
+                          including version number. If only a base ID is
+                          provided for specific motif(s), then just the latest
+                          version of those motif(s) are returned unless
+                          'all_versions' is also specified.
+            collection  - Only motifs from the specified JASPAR collection(s)
+                          are returned. NOTE - if not specified, the collection
+                          defaults to CORE for all other selection criteria
+                          except 'all' and 'matrix_id'. To apply the other
+                          selection criteria across all JASPAR collections,
+                          explicitly set collection=None.
+            tf_name     - Only motifs with the given name(s) are returned.
+            tf_class    - Only motifs of the given TF class(es) are returned.
+            tf_family   - Only motifs from the given TF families are returned.
+            tax_group   - Only motifs belonging to the given taxonomic
+                          supergroups are returned (e.g. 'vertebrates',
+                          'insects', 'nematodes' etc.)
+            species     - Only motifs derived from the given species are
+                          returned.  Species are specified as taxonomy IDs.
+            data_type   - Only motifs generated with the given data type (e.g.
+                          ('ChIP-seq', 'PBM', 'SELEX' etc.) are returned.
+                          NOTE - must match exactly as stored in the database.
+            pazar_id    - Only motifs with the given PAZAR TF ID are returned.
+            medline     - Only motifs with the given medline (PubMed IDs) are
+                          returned.
+            min_ic      - Only motifs whose profile matrices have at least this
+                          information content (specificity) are returned.
+            min_length  - Only motifs whose profiles are of at least this
+                          length are returned.
+            min_sites   - Only motifs compiled from at least this many binding
+                          sites are returned.
+            all_versions- Unless specified, just the latest version of motifs
+                          determined by the other selection criteria are
+                          returned. Otherwise all versions of the selected
+                          motifs are returned.
+
+        Returns:
+            - A Bio.motifs.jaspar.Record (list) of motifs.
+
+        """
+        # Fetch the internal IDs of the motifs using the criteria provided
+        int_ids = self._fetch_internal_id_list(
+            collection=collection,
+            tf_name=tf_name,
+            tf_class=tf_class,
+            tf_family=tf_family,
+            matrix_id=matrix_id,
+            tax_group=tax_group,
+            species=species,
+            pazar_id=pazar_id,
+            data_type=data_type,
+            medline=medline,
+            all=all,
+            all_versions=all_versions,
+        )
+
+        record = jaspar.Record()
+
+        """
+        Now further filter motifs returned above based on any specified
+        matrix specific criteria.
+        """
+        for int_id in int_ids:
+            motif = self._fetch_motif_by_internal_id(int_id)
+
+            # Filter motifs to those with matrix IC greater than min_ic
+            if min_ic:
+                if motif.pssm.mean() < min_ic:
+                    continue
+
+            # Filter motifs to those with minimum length of min_length
+            if min_length:
+                if motif.length < min_length:
+                    continue
+
+            # XXX We could also supply a max_length filter.
+
+            """
+            Filter motifs to those composed of at least this many sites.
+            The perl TFBS module assumes column sums may be different but
+            this should be strictly enforced here we will ignore this and
+            just use the first column sum.
+            """
+            if min_sites:
+                num_sites = sum(motif.counts[nt][0] for nt in motif.alphabet)
+                if num_sites < min_sites:
+                    continue
+
+            record.append(motif)
+
+        return record
+
+    def _fetch_latest_version(self, base_id):
+        """Get the latest version number for the given base_id (PRIVATE)."""
+        cur = self.dbh.cursor()
+        cur.execute(
+            "select VERSION from MATRIX where BASE_id = %s order by VERSION"
+            " desc limit 1",
+            (base_id,),
+        )
+
+        row = cur.fetchone()
+
+        latest = None
+        if row:
+            latest = row[0]
+        else:
+            warnings.warn(
+                "Failed to fetch latest version number for JASPAR motif"
+                f" with base ID '{base_id}'. No JASPAR motif with this"
+                " base ID appears to exist in the database.",
+                BiopythonWarning,
+            )
+
+        return latest
+
+    def _fetch_internal_id(self, base_id, version):
+        """Fetch the internal id for a base id + version (PRIVATE).
+
+        Also checks if this combo exists or not.
+        """
+        cur = self.dbh.cursor()
+        cur.execute(
+            "select id from MATRIX where BASE_id = %s and VERSION = %s",
+            (base_id, version),
+        )
+
+        row = cur.fetchone()
+
+        int_id = None
+        if row:
+            int_id = row[0]
+        else:
+            warnings.warn(
+                "Failed to fetch internal database ID for JASPAR motif"
+                f" with matrix ID '{base_id}.{version}'. No JASPAR motif"
+                " with this matrix ID appears to exist.",
+                BiopythonWarning,
+            )
+
+        return int_id
+
+    def _fetch_motif_by_internal_id(self, int_id):
+        """Fetch basic motif information (PRIVATE)."""
+        cur = self.dbh.cursor()
+        cur.execute(
+            "select BASE_ID, VERSION, COLLECTION, NAME from MATRIX where id = %s",
+            (int_id,),
+        )
+
+        row = cur.fetchone()
+
+        # This should never happen as it is an internal method. If it does
+        # we should probably raise an exception
+        if not row:
+            warnings.warn(
+                f"Could not fetch JASPAR motif with internal ID = {int_id}",
+                BiopythonWarning,
+            )
+            return None
+
+        base_id = row[0]
+        version = row[1]
+        collection = row[2]
+        name = row[3]
+
+        matrix_id = "".join([base_id, ".", str(version)])
+
+        # fetch the counts matrix
+        counts = self._fetch_counts_matrix(int_id)
+
+        # Create new JASPAR motif
+        motif = jaspar.Motif(matrix_id, name, collection=collection, counts=counts)
+
+        # fetch species
+        cur.execute("select TAX_ID from MATRIX_SPECIES where id = %s", (int_id,))
+        tax_ids = []
+        rows = cur.fetchall()
+        for row in rows:
+            tax_ids.append(row[0])
+
+        # Many JASPAR motifs (especially those not in the CORE collection)
+        # do not have taxonomy IDs. So this warning would get annoying.
+        # if not tax_ids:
+        #     warnings.warn("Could not fetch any taxonomy IDs for JASPAR motif"
+        #                   " {0}".format(motif.matrix_id), BiopythonWarning)
+
+        motif.species = tax_ids
+
+        # fetch protein accession numbers
+        cur.execute("select ACC FROM MATRIX_PROTEIN where id = %s", (int_id,))
+        accs = []
+        rows = cur.fetchall()
+        for row in rows:
+            accs.append(row[0])
+
+        # Similarly as for taxonomy IDs, it would get annoying to print
+        # warnings for JASPAR motifs which do not have accession numbers.
+
+        motif.acc = accs
+
+        # fetch remaining annotation as tags from the ANNOTATION table
+        cur.execute("select TAG, VAL from MATRIX_ANNOTATION where id = %s", (int_id,))
+        rows = cur.fetchall()
+        for row in rows:
+            attr = row[0]
+            val = row[1]
+            if attr == "class":
+                motif.tf_class = val
+            elif attr == "family":
+                motif.tf_family = val
+            elif attr == "tax_group":
+                motif.tax_group = val
+            elif attr == "type":
+                motif.data_type = val
+            elif attr == "pazar_tf_id":
+                motif.pazar_id = val
+            elif attr == "medline":
+                motif.medline = val
+            elif attr == "comment":
+                motif.comment = val
+            else:
+                # TODO If we were to implement additional arbitrary tags
+                # motif.tag(attr, val)
+                pass
+
+        return motif
+
+    def _fetch_counts_matrix(self, int_id):
+        """Fetch the counts matrix from the JASPAR DB by the internal ID (PRIVATE).
+
+        Returns a Bio.motifs.matrix.GenericPositionMatrix
+        """
+        counts = {}
+        cur = self.dbh.cursor()
+
+        for base in "ACGT":
+            base_counts = []
+
+            cur.execute(
+                "select val from MATRIX_DATA where ID = %s and row = %s order by col",
+                (int_id, base),
+            )
+
+            rows = cur.fetchall()
+            for row in rows:
+                base_counts.append(row[0])
+
+            counts[base] = [float(x) for x in base_counts]
+
+        return matrix.GenericPositionMatrix("ACGT", counts)
+
+    def _fetch_internal_id_list(
+        self,
+        collection=JASPAR_DFLT_COLLECTION,
+        tf_name=None,
+        tf_class=None,
+        tf_family=None,
+        matrix_id=None,
+        tax_group=None,
+        species=None,
+        pazar_id=None,
+        data_type=None,
+        medline=None,
+        all=False,
+        all_versions=False,
+    ):
+        """Fetch list of internal JASPAR motif IDs.
+
+        Fetch a list of internal JASPAR motif IDs based on various passed
+        parameters which may then be used to fetch the rest of the motif data.
+
+        Caller:
+            fetch_motifs()
+
+        Arguments:
+            See arguments sections of fetch_motifs()
+
+        Returns:
+            A list of internal JASPAR motif IDs which match the given
+            selection criteria arguments.
+
+
+        Build an SQL query based on the selection arguments provided.
+
+        1: First add table joins and sub-clauses for criteria corresponding to
+           named fields from the MATRIX and MATRIX_SPECIES tables such as
+           collection, matrix ID, name, species etc.
+
+        2: Then add joins/sub-clauses for tag/value parameters from the
+           MATRIX_ANNOTATION table.
+
+        For the surviving matrices, the responsibility to do matrix-based
+        feature filtering such as ic, number of sites etc, fall on the
+        calling fetch_motifs() method.
+
+        """
+        int_ids = []
+
+        cur = self.dbh.cursor()
+
+        """
+        Special case 1: fetch ALL motifs. Highest priority.
+        Ignore all other selection arguments.
+        """
+        if all:
+            cur.execute("select ID from MATRIX")
+            rows = cur.fetchall()
+
+            for row in rows:
+                int_ids.append(row[0])
+
+            return int_ids
+
+        """
+        Special case 2: fetch specific motifs by their JASPAR IDs. This
+        has higher priority than any other except the above 'all' case.
+        Ignore all other selection arguments.
+        """
+        if matrix_id:
+            """
+            These might be either stable IDs or stable_ID.version.
+            If just a stable ID is given and all_versions is set, return
+            all versions; otherwise just the latest.
+            """
+            if all_versions:
+                for id in matrix_id:
+                    # ignore the version here; this is just a sanity check on the ID format
+                    (base_id, version) = jaspar.split_jaspar_id(id)
+                    cur.execute("select ID from MATRIX where BASE_ID = %s", (base_id,))
+
+                    rows = cur.fetchall()
+                    for row in rows:
+                        int_ids.append(row[0])
+            else:
+                # only the latest version, or the requested version
+                for id in matrix_id:
+                    (base_id, version) = jaspar.split_jaspar_id(id)
+
+                    if not version:
+                        version = self._fetch_latest_version(base_id)
+
+                    int_id = None
+                    if version:
+                        int_id = self._fetch_internal_id(base_id, version)
+
+                    if int_id:
+                        int_ids.append(int_id)
+
+            return int_ids
+
+        tables = ["MATRIX m"]
+        where_clauses = []
+
+        # Select by MATRIX.COLLECTION
+        if collection:
+            if isinstance(collection, list):
+                # Multiple collections passed in as a list
+                clause = "m.COLLECTION in ('"
+                clause = "".join([clause, "','".join(collection)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single collection - typical usage
+                clause = "m.COLLECTION = '%s'" % collection
+
+            where_clauses.append(clause)
+
+        # Select by MATRIX.NAME
+        if tf_name:
+            if isinstance(tf_name, list):
+                # Multiple names passed in as a list
+                clause = "m.NAME in ('"
+                clause = "".join([clause, "','".join(tf_name)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single name
+                clause = "m.NAME = '%s'" % tf_name
+
+            where_clauses.append(clause)
+
+        # Select by MATRIX_SPECIES.TAX_ID
+        if species:
+            tables.append("MATRIX_SPECIES ms")
+            where_clauses.append("m.ID = ms.ID")
+
+            """
+            NOTE: species are numeric taxonomy IDs but stored as varchars
+            in the DB.
+            """
+            if isinstance(species, list):
+                # Multiple tax IDs passed in as a list
+                clause = "ms.TAX_ID in ('"
+                clause = "".join([clause, "','".join(str(s) for s in species)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single tax ID
+                clause = "ms.TAX_ID = '%s'" % species
+
+            where_clauses.append(clause)
+
+        """
+        Tag-based selection from MATRIX_ANNOTATION.
+        This differs from the perl TFBS module, whose matrix class has a
+        generic tag attribute corresponding to the tags in the database;
+        that provides tremendous flexibility for adding new tags to the DB
+        and selecting on those tags without adding new code. In the JASPAR
+        Motif class we have elected to use specific attributes for the most
+        commonly used tags, and correspondingly we only allow selection on
+        these attributes here.
+
+        The attributes corresponding to the tags for which selection is
+        provided are:
+
+           Attribute   Tag
+           tf_class    class
+           tf_family   family
+           pazar_id    pazar_tf_id
+           medline     medline
+           data_type   type
+           tax_group   tax_group
+        """
+
+        # Select by TF class(es) (MATRIX_ANNOTATION.TAG="class")
+        if tf_class:
+            tables.append("MATRIX_ANNOTATION ma1")
+            where_clauses.append("m.ID = ma1.ID")
+
+            clause = "ma1.TAG = 'class'"
+            if isinstance(tf_class, list):
+                # A list of TF classes
+                clause = "".join([clause, " and ma1.VAL in ('"])
+                clause = "".join([clause, "','".join(tf_class)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single TF class
+                clause = "".join([clause, " and ma1.VAL = '%s' " % tf_class])
+
+            where_clauses.append(clause)
+
+        # Select by TF families (MATRIX_ANNOTATION.TAG="family")
+        if tf_family:
+            tables.append("MATRIX_ANNOTATION ma2")
+            where_clauses.append("m.ID = ma2.ID")
+
+            clause = "ma2.TAG = 'family'"
+            if isinstance(tf_family, list):
+                # A list of TF families
+                clause = "".join([clause, " and ma2.VAL in ('"])
+                clause = "".join([clause, "','".join(tf_family)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single TF family
+                clause = "".join([clause, " and ma2.VAL = '%s' " % tf_family])
+
+            where_clauses.append(clause)
+
+        # Select by PAZAR TF ID(s) (MATRIX_ANNOTATION.TAG="pazar_tf_id")
+        if pazar_id:
+            tables.append("MATRIX_ANNOTATION ma3")
+            where_clauses.append("m.ID = ma3.ID")
+
+            clause = "ma3.TAG = 'pazar_tf_id'"
+            if isinstance(pazar_id, list):
+                # A list of PAZAR IDs
+                clause = "".join([clause, " and ma3.VAL in ('"])
+                clause = "".join([clause, "','".join(pazar_id)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single PAZAR ID
+                clause = "".join([" and ma3.VAL = '%s' " % pazar_id])
+
+            where_clauses.append(clause)
+
+        # Select by PubMed ID(s) (MATRIX_ANNOTATION.TAG="medline")
+        if medline:
+            tables.append("MATRIX_ANNOTATION ma4")
+            where_clauses.append("m.ID = ma4.ID")
+
+            clause = "ma4.TAG = 'medline'"
+            if isinstance(medline, list):
+                # A list of PubMed IDs
+                clause = "".join([clause, " and ma4.VAL in ('"])
+                clause = "".join([clause, "','".join(medline)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single PubMed ID
+                clause = "".join([" and ma4.VAL = '%s' " % medline])
+
+            where_clauses.append(clause)
+
+        # Select by data type(s) used to compile the matrix
+        # (MATRIX_ANNOTATION.TAG="type")
+        if data_type:
+            tables.append("MATRIX_ANNOTATION ma5")
+            where_clauses.append("m.ID = ma5.ID")
+
+            clause = "ma5.TAG = 'type'"
+            if isinstance(data_type, list):
+                # A list of data types
+                clause = "".join([clause, " and ma5.VAL in ('"])
+                clause = "".join([clause, "','".join(data_type)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single data type
+                clause = "".join([" and ma5.VAL = '%s' " % data_type])
+
+            where_clauses.append(clause)
+
+        # Select by taxonomic supergroup(s) (MATRIX_ANNOTATION.TAG="tax_group")
+        if tax_group:
+            tables.append("MATRIX_ANNOTATION ma6")
+            where_clauses.append("m.ID = ma6.ID")
+
+            clause = "ma6.TAG = 'tax_group'"
+            if isinstance(tax_group, list):
+                # A list of tax IDs
+                clause = "".join([clause, " and ma6.VAL in ('"])
+                clause = "".join([clause, "','".join(tax_group)])
+                clause = "".join([clause, "')"])
+            else:
+                # A single tax ID
+                clause = "".join([clause, " and ma6.VAL = '%s' " % tax_group])
+
+            where_clauses.append(clause)
+
+        sql = "".join(["select distinct(m.ID) from ", ", ".join(tables)])
+
+        if where_clauses:
+            sql = "".join([sql, " where ", " and ".join(where_clauses)])
+
+        # print("sql = %s" % sql)
+
+        cur.execute(sql)
+        rows = cur.fetchall()
+
+        for row in rows:
+            id = row[0]
+            if all_versions:
+                int_ids.append(id)
+            else:
+                # is the latest version?
+                if self._is_latest_version(id):
+                    int_ids.append(id)
+
+        if len(int_ids) < 1:
+            warnings.warn(
+                "Zero motifs returned with current select critera", BiopythonWarning
+            )
+
+        return int_ids
+
+    def _is_latest_version(self, int_id):
+        """Check if the internal ID represents the latest JASPAR matrix (PRIVATE).
+
+        Does this internal ID represent the latest version of the JASPAR
+        matrix (collapse on base ids)
+        """
+        cur = self.dbh.cursor()
+
+        cur.execute(
+            "select count(*) from MATRIX where "
+            "BASE_ID = (select BASE_ID from MATRIX where ID = %s) "
+            "and VERSION > (select VERSION from MATRIX where ID = %s)",
+            (int_id, int_id),
+        )
+
+        row = cur.fetchone()
+
+        count = row[0]
+
+        if count == 0:
+            # no matrices with higher version ID and same base id
+            return True
+
+        return False
diff --git a/code/lib/Bio/motifs/mast.py b/code/lib/Bio/motifs/mast.py
new file mode 100644
index 0000000..face5b9
--- /dev/null
+++ b/code/lib/Bio/motifs/mast.py
@@ -0,0 +1,133 @@
+# Copyright 2008 by Bartek Wilczynski.
+# Adapted from Bio.MEME.Parser by Jason A. Hackney.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Module for the support of Motif Alignment and Search Tool (MAST)."""
+
+import xml.etree.ElementTree as ET
+
+from Bio.motifs import meme
+
+
+class Record(list):
+    """The class for holding the results from a MAST run.
+
+    A mast.Record holds data about matches between motifs and sequences.
+    The motifs held by the Record are objects of the class meme.Motif.
+
+    The mast.Record class inherits from list, so you can access individual
+    motifs in the record by their index. Alternatively, you can find a motif
+    by its name:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/mast.crp0.de.oops.txt.xml") as f:
+    ...     record = motifs.parse(f, 'MAST')
+    >>> motif = record[0]
+    >>> print(motif.name)
+    1
+    >>> motif = record['1']
+    >>> print(motif.name)
+    1
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.sequences = []
+        self.version = ""
+        self.database = ""
+        self.diagrams = {}
+        self.alphabet = None
+        self.strand_handling = ""
+
+    def __getitem__(self, key):
+        """Return the motif of index key."""
+        if isinstance(key, str):
+            for motif in self:
+                if motif.name == key:
+                    return motif
+        else:
+            return list.__getitem__(self, key)
+
+
+def read(handle):
+    """Parse a MAST XML format handle as a Record object."""
+    record = Record()
+    try:
+        xml_tree = ET.parse(handle)
+    except ET.ParseError:
+        raise ValueError(
+            "Improper MAST XML input file. XML root tag should start with  maximum:
+                    maximum = count
+                    sequence_letter = letter
+            sequence += sequence_letter
+        return Seq(sequence)
+
+    @property
+    def anticonsensus(self):
+        """Return the anticonsensus sequence."""
+        sequence = ""
+        for i in range(self.length):
+            minimum = math.inf
+            for letter in self.alphabet:
+                count = self[letter][i]
+                if count < minimum:
+                    minimum = count
+                    sequence_letter = letter
+            sequence += sequence_letter
+        return Seq(sequence)
+
+    @property
+    def degenerate_consensus(self):
+        """Return the degenerate consensus sequence."""
+        # Following the rules adapted from
+        # D. R. Cavener: "Comparison of the consensus sequence flanking
+        # translational start sites in Drosophila and vertebrates."
+        # Nucleic Acids Research 15(4): 1353-1361. (1987).
+        # The same rules are used by TRANSFAC.
+        degenerate_nucleotide = {
+            "A": "A",
+            "C": "C",
+            "G": "G",
+            "T": "T",
+            "AC": "M",
+            "AG": "R",
+            "AT": "W",
+            "CG": "S",
+            "CT": "Y",
+            "GT": "K",
+            "ACG": "V",
+            "ACT": "H",
+            "AGT": "D",
+            "CGT": "B",
+            "ACGT": "N",
+        }
+        sequence = ""
+        for i in range(self.length):
+
+            def get(nucleotide):
+                return self[nucleotide][i]
+
+            nucleotides = sorted(self, key=get, reverse=True)
+            counts = [self[c][i] for c in nucleotides]
+            # Follow the Cavener rules:
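+            # 1. use a single letter if its count exceeds the sum of the
+            #    others and is more than twice the runner-up;
+            # 2. use a two-letter IUPAC code if the top two counts exceed
+            #    75% of the total;
+            # 3. use a three-letter code if the fourth letter is absent;
+            # 4. otherwise use N.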
+            if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]:
+                key = nucleotides[0]
+            elif 4 * sum(counts[:2]) > 3 * sum(counts):
+                key = "".join(sorted(nucleotides[:2]))
+            elif counts[3] == 0:
+                key = "".join(sorted(nucleotides[:3]))
+            else:
+                key = "ACGT"
+            nucleotide = degenerate_nucleotide.get(key, key)
+            sequence += nucleotide
+        return Seq(sequence)
+
+    @property
+    def gc_content(self):
+        """Compute the fraction GC content."""
+        alphabet = self.alphabet
+        gc_total = 0.0
+        total = 0.0
+        for i in range(self.length):
+            for letter in alphabet:
+                if letter in "CG":
+                    gc_total += self[letter][i]
+                total += self[letter][i]
+        return gc_total / total
+
+    def reverse_complement(self):
+        """Compute reverse complement."""
+        values = {}
+        if self.alphabet == "ACGU":
+            values["A"] = self["U"][::-1]
+            values["U"] = self["A"][::-1]
+        else:
+            values["A"] = self["T"][::-1]
+            values["T"] = self["A"][::-1]
+        values["G"] = self["C"][::-1]
+        values["C"] = self["G"][::-1]
+        alphabet = self.alphabet
+        return self.__class__(alphabet, values)
+
+
+class FrequencyPositionMatrix(GenericPositionMatrix):
+    """Class for the support of frequency calculations on the Position Matrix."""
+
+    def normalize(self, pseudocounts=None):
+        """Create and return a position-weight matrix by normalizing the counts matrix.
+
+        If pseudocounts is None (default), no pseudocounts are added
+        to the counts.
+
+        If pseudocounts is a number, it is added to the counts before
+        calculating the position-weight matrix.
+
+        Alternatively, the pseudocounts can be a dictionary with a key
+        for each letter in the alphabet associated with the motif.
+        """
+        counts = {}
+        if pseudocounts is None:
+            for letter in self.alphabet:
+                counts[letter] = [0.0] * self.length
+        elif isinstance(pseudocounts, dict):
+            for letter in self.alphabet:
+                counts[letter] = [float(pseudocounts[letter])] * self.length
+        else:
+            for letter in self.alphabet:
+                counts[letter] = [float(pseudocounts)] * self.length
+        for i in range(self.length):
+            for letter in self.alphabet:
+                counts[letter][i] += self[letter][i]
+        # Actual normalization is done in the PositionWeightMatrix initializer
+        return PositionWeightMatrix(self.alphabet, counts)
+
+
+class PositionWeightMatrix(GenericPositionMatrix):
+    """Class for the support of weight calculations on the Position Matrix."""
+
+    def __init__(self, alphabet, counts):
+        """Initialize the class."""
+        GenericPositionMatrix.__init__(self, alphabet, counts)
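+        # Normalize each column so the letter frequencies at each position
+        # sum to 1.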
+        for i in range(self.length):
+            total = sum(float(self[letter][i]) for letter in alphabet)
+            for letter in alphabet:
+                self[letter][i] /= total
+        for letter in alphabet:
+            self[letter] = tuple(self[letter])
+
+    def log_odds(self, background=None):
+        """Return the Position-Specific Scoring Matrix.
+
+        The Position-Specific Scoring Matrix (PSSM) contains the log-odds
+        scores computed from the probability matrix and the background
+        probabilities. If the background is None, a uniform background
+        distribution is assumed.
+        """
+        values = {}
+        alphabet = self.alphabet
+        if background is None:
+            background = dict.fromkeys(self.alphabet, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in alphabet:
+            background[letter] /= total
+            values[letter] = []
+        for i in range(self.length):
+            for letter in alphabet:
+                b = background[letter]
+                if b > 0:
+                    p = self[letter][i]
+                    if p > 0:
+                        logodds = math.log(p / b, 2)
+                    else:
+                        logodds = -math.inf
+                else:
+                    p = self[letter][i]
+                    if p > 0:
+                        logodds = math.inf
+                    else:
+                        logodds = math.nan
+                values[letter].append(logodds)
+        pssm = PositionSpecificScoringMatrix(alphabet, values)
+        return pssm
+
+
+class PositionSpecificScoringMatrix(GenericPositionMatrix):
+    """Class for the support of Position Specific Scoring Matrix calculations."""
+
+    def calculate(self, sequence):
+        """Return the PWM score for a given sequence for all positions.
+
+        Notes:
+         - the sequence can only be a DNA sequence
+         - the search is performed only on one strand
+         - if the sequence and the motif have the same length, a single
+           number is returned
+         - otherwise, the result is a one-dimensional numpy array
+
+        """
+        # TODO - Code itself tolerates ambiguous bases (as NaN).
+        if sorted(self.alphabet) != ["A", "C", "G", "T"]:
+            raise ValueError(
+                "PSSM has wrong alphabet: %s - Use only with DNA motifs" % self.alphabet
+            )
+
+        # NOTE: The C code handles mixed case input as this could be large
+        # (e.g. contig or chromosome), so requiring it be all upper or lower
+        # case would impose an overhead to allocate the extra memory.
+        try:
+            sequence = bytes(sequence)
+        except TypeError:  # str
+            try:
+                sequence = bytes(sequence, "ASCII")
+            except TypeError:
+                raise ValueError(
+                    "sequence should be a Seq, MutableSeq, string, or bytes-like object"
+                ) from None
+            except UnicodeEncodeError:
+                raise ValueError(
+                    "sequence should contain ASCII characters only"
+                ) from None
+        except Exception:
+            raise ValueError(
+                "sequence should be a Seq, MutableSeq, string, or bytes-like object"
+            ) from None
+
+        n = len(sequence)
+        m = self.length
+        # Create the numpy arrays here; the C module then does not rely on numpy
+        # Use a float32 for the scores array to save space
+        scores = np.empty(n - m + 1, np.float32)
+        logodds = np.array(
+            [[self[letter][i] for letter in "ACGT"] for i in range(m)], float
+        )
+        _pwm.calculate(sequence, logodds, scores)
+
+        if len(scores) == 1:
+            return scores[0]
+        else:
+            return scores
+
+    def search(self, sequence, threshold=0.0, both=True, chunksize=10 ** 6):
+        """Find hits with PWM score above given threshold.
+
+        A generator function, returning found hits in the given sequence
+        with the pwm score higher than the threshold.
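+
+        With both=True (the default), the reverse strand is searched as
+        well; reverse-strand hits are reported with negative positions,
+        counted back from the end of the sequence.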
+        """
+        sequence = sequence.upper()
+        seq_len = len(sequence)
+        motif_l = self.length
+        chunk_starts = np.arange(0, seq_len, chunksize)
+        if both:
+            rc = self.reverse_complement()
+        for chunk_start in chunk_starts:
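+            # Extend each chunk by motif_l - 1 positions so that hits
+            # spanning a chunk boundary are not missed.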
+            subseq = sequence[chunk_start : chunk_start + chunksize + motif_l - 1]
+            pos_scores = self.calculate(subseq)
+            pos_ind = pos_scores >= threshold
+            pos_positions = np.where(pos_ind)[0] + chunk_start
+            pos_scores = pos_scores[pos_ind]
+            if both:
+                neg_scores = rc.calculate(subseq)
+                neg_ind = neg_scores >= threshold
+                neg_positions = np.where(neg_ind)[0] + chunk_start
+                neg_scores = neg_scores[neg_ind]
+            else:
+                neg_positions = np.empty((0), dtype=int)
+                neg_scores = np.empty((0), dtype=int)
+            chunk_positions = np.append(pos_positions, neg_positions - seq_len)
+            chunk_scores = np.append(pos_scores, neg_scores)
+            order = np.argsort(np.append(pos_positions, neg_positions))
+            chunk_positions = chunk_positions[order]
+            chunk_scores = chunk_scores[order]
+            yield from zip(chunk_positions, chunk_scores)
+
+    @property
+    def max(self):
+        """Maximal possible score for this motif.
+
+        Returns the score computed for the consensus sequence.
+        """
+        score = 0.0
+        letters = self.alphabet
+        for position in range(0, self.length):
+            score += max(self[letter][position] for letter in letters)
+        return score
+
+    @property
+    def min(self):
+        """Minimal possible score for this motif.
+
+        Returns the score computed for the anticonsensus sequence.
+        """
+        score = 0.0
+        letters = self.alphabet
+        for position in range(0, self.length):
+            score += min(self[letter][position] for letter in letters)
+        return score
+
+    @property
+    def gc_content(self):
+        """Compute the GC-ratio."""
+        raise Exception("Cannot compute the %GC composition of a PSSM")
+
+    def mean(self, background=None):
+        """Return expected value of the score of a motif."""
+        if background is None:
+            background = dict.fromkeys(self.alphabet, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self.alphabet:
+            background[letter] /= total
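+        # E[score] = sum over positions and letters of p * logodds, where
+        # p = b * 2**logodds is the motif's probability of the letter
+        # (since logodds = log2(p / b)).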
+        sx = 0.0
+        for i in range(self.length):
+            for letter in self.alphabet:
+                logodds = self[letter, i]
+                if math.isnan(logodds):
+                    continue
+                if math.isinf(logodds) and logodds < 0:
+                    continue
+                b = background[letter]
+                p = b * math.pow(2, logodds)
+                sx += p * logodds
+        return sx
+
+    def std(self, background=None):
+        """Return standard deviation of the score of a motif."""
+        if background is None:
+            background = dict.fromkeys(self.alphabet, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self.alphabet:
+            background[letter] /= total
+        variance = 0.0
+        for i in range(self.length):
+            sx = 0.0
+            sxx = 0.0
+            for letter in self.alphabet:
+                logodds = self[letter, i]
+                if math.isnan(logodds):
+                    continue
+                if math.isinf(logodds) and logodds < 0:
+                    continue
+                b = background[letter]
+                p = b * math.pow(2, logodds)
+                sx += p * logodds
+                sxx += p * logodds * logodds
+            sxx -= sx * sx
+            variance += sxx
+        variance = max(variance, 0)  # to avoid roundoff problems
+        return math.sqrt(variance)
+
+    def dist_pearson(self, other):
+        """Return the similarity score based on pearson correlation for the given motif against self.
+
+        We use the Pearson's correlation of the respective probabilities.
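+
+        All relative offsets of the two motifs are tried and the highest
+        correlation kept; the returned value is (1 - max correlation,
+        offset at the maximum), so 0 indicates a perfect match.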
+        """
+        if self.alphabet != other.alphabet:
+            raise ValueError("Cannot compare motifs with different alphabets")
+
+        max_p = -2
+        for offset in range(-self.length + 1, other.length):
+            if offset < 0:
+                p = self.dist_pearson_at(other, -offset)
+            else:  # offset>=0
+                p = other.dist_pearson_at(self, offset)
+            if max_p < p:
+                max_p = p
+                max_o = -offset
+        return 1 - max_p, max_o
+
+    def dist_pearson_at(self, other, offset):
+        """Return the similarity score based on pearson correlation at the given offset."""
+        letters = self.alphabet
+        sx = 0.0  # \sum x
+        sy = 0.0  # \sum y
+        sxx = 0.0  # \sum x^2
+        sxy = 0.0  # \sum x \cdot y
+        syy = 0.0  # \sum y^2
+        norm = max(self.length, offset + other.length) * len(letters)
+        for pos in range(min(self.length - offset, other.length)):
+            xi = [self[letter, pos + offset] for letter in letters]
+            yi = [other[letter, pos] for letter in letters]
+            sx += sum(xi)
+            sy += sum(yi)
+            sxx += sum(x * x for x in xi)
+            sxy += sum(x * y for x, y in zip(xi, yi))
+            syy += sum(y * y for y in yi)
+        sx /= norm
+        sy /= norm
+        sxx /= norm
+        sxy /= norm
+        syy /= norm
+        numerator = sxy - sx * sy
+        denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy))
+        return numerator / denominator
+
+    def distribution(self, background=None, precision=10 ** 3):
+        """Calculate the distribution of the scores at the given precision."""
+        from .thresholds import ScoreDistribution
+
+        if background is None:
+            background = dict.fromkeys(self.alphabet, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self.alphabet:
+            background[letter] /= total
+        return ScoreDistribution(precision=precision, pssm=self, background=background)
diff --git a/code/lib/Bio/motifs/meme.py b/code/lib/Bio/motifs/meme.py
new file mode 100644
index 0000000..25ee0b4
--- /dev/null
+++ b/code/lib/Bio/motifs/meme.py
@@ -0,0 +1,195 @@
+# Copyright 2008 by Bartek Wilczynski.
+# Revisions copyright 2019 by Victor Lin.
+# Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Module for the support of MEME motif format."""
+
+import xml.etree.ElementTree as ET
+
+from Bio import Seq
+from Bio import motifs
+
+
+def read(handle):
+    """Parse the text output of the MEME program into a meme.Record object.
+
+    Examples
+    --------
+    >>> from Bio.motifs import meme
+    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
+    ...     record = meme.read(f)
+    >>> for motif in record:
+    ...     for instance in motif.instances:
+    ...         print(instance.motif_name, instance.sequence_name, instance.sequence_id, instance.strand, instance.pvalue)
+    GSKGCATGTGAAA INO1 sequence_5 + 1.21e-08
+    GSKGCATGTGAAA FAS1 sequence_2 - 1.87e-08
+    GSKGCATGTGAAA ACC1 sequence_4 - 6.62e-08
+    GSKGCATGTGAAA CHO2 sequence_1 - 1.05e-07
+    GSKGCATGTGAAA CHO1 sequence_0 - 1.69e-07
+    GSKGCATGTGAAA FAS2 sequence_3 - 5.62e-07
+    GSKGCATGTGAAA OPI3 sequence_6 + 1.08e-06
+    TTGACWCYTGCYCWG CHO2 sequence_1 + 7.2e-10
+    TTGACWCYTGCYCWG OPI3 sequence_6 - 2.56e-08
+    TTGACWCYTGCYCWG ACC1 sequence_4 - 1.59e-07
+    TTGACWCYTGCYCWG CHO1 sequence_0 + 2.05e-07
+    TTGACWCYTGCYCWG FAS1 sequence_2 + 3.85e-07
+    TTGACWCYTGCYCWG FAS2 sequence_3 - 5.11e-07
+    TTGACWCYTGCYCWG INO1 sequence_5 + 8.01e-07
+
+    """
+    record = Record()
+    try:
+        xml_tree = ET.parse(handle)
+    except ET.ParseError:
+        raise ValueError(
+            "Improper MEME XML input file. XML root tag should start with >> from Bio import motifs
+    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
+    ...     record = motifs.parse(f, 'MEME')
+    >>> motif = record[0]
+    >>> print(motif.name)
+    GSKGCATGTGAAA
+    >>> motif = record['GSKGCATGTGAAA']
+    >>> print(motif.name)
+    GSKGCATGTGAAA
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.version = ""
+        self.datafile = ""
+        self.command = ""
+        self.alphabet = ""
+        self.sequences = []
+
+    def __getitem__(self, key):
+        """Return the motif of index key."""
+        if isinstance(key, str):
+            for motif in self:
+                if motif.name == key:
+                    return motif
+        else:
+            return list.__getitem__(self, key)
+
+
+# Everything below is private
+
+
+def __read_metadata(record, xml_tree):
+    record.version = xml_tree.getroot().get("version")
+    record.datafile = xml_tree.find("training_set").get("primary_sequences")
+    record.command = xml_tree.find("model").find("command_line").text
+    # TODO - background_frequencies, other metadata under model
+
+
+def __read_alphabet(record, xml_tree):
+    alphabet_tree = (
+        xml_tree.find("training_set").find("letter_frequencies").find("alphabet_array")
+    )
+    for value in alphabet_tree.findall("value"):
+        record.alphabet += value.get("letter_id")
+
+
+def __get_sequence_id_name_map(xml_tree):
+    return {
+        sequence_tree.get("id"): sequence_tree.get("name")
+        for sequence_tree in xml_tree.find("training_set").findall("sequence")
+    }
+
+
+def __read_motifs(record, xml_tree, sequence_id_name_map):
+    for motif_tree in xml_tree.find("motifs").findall("motif"):
+        instances = []
+        for site_tree in motif_tree.find("contributing_sites").findall(
+            "contributing_site"
+        ):
+            letters = [
+                letter_ref.get("letter_id")
+                for letter_ref in site_tree.find("site").findall("letter_ref")
+            ]
+            sequence = "".join(letters)
+            instance = Instance(sequence)
+            instance.motif_name = motif_tree.get("name")
+            instance.sequence_id = site_tree.get("sequence_id")
+            instance.sequence_name = sequence_id_name_map[instance.sequence_id]
+            # TODO - left flank, right flank
+            instance.start = int(site_tree.get("position")) + 1
+            instance.pvalue = float(site_tree.get("pvalue"))
+            instance.strand = __convert_strand(site_tree.get("strand"))
+            instance.length = len(sequence)
+            instances.append(instance)
+        instances = motifs.Instances(instances, record.alphabet)
+        motif = Motif(record.alphabet, instances)
+        motif.id = motif_tree.get("id")
+        motif.name = motif_tree.get("name")
+        motif.alt_id = motif_tree.get("alt")
+        motif.length = int(motif_tree.get("width"))
+        motif.num_occurrences = int(motif_tree.get("sites"))
+        motif.evalue = float(motif_tree.get("e_value"))
+        # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time
+        record.append(motif)
+
+
+def __convert_strand(strand):
+    """Convert strand (+/-) from XML if present.
+
+    Default: +
+    """
+    if strand == "minus":
+        return "-"
+    if strand == "plus" or strand == "none":
+        return "+"
diff --git a/code/lib/Bio/motifs/minimal.py b/code/lib/Bio/motifs/minimal.py
new file mode 100644
index 0000000..bdf7e4c
--- /dev/null
+++ b/code/lib/Bio/motifs/minimal.py
@@ -0,0 +1,193 @@
+# Copyright 2018 by Ariel Aptekmann.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Module for the support of MEME minimal motif format."""
+
+from Bio import motifs
+
+
+def read(handle):
+    """Parse the text output of the MEME program into a meme.Record object.
+
+    Examples
+    --------
+    >>> from Bio.motifs import minimal
+    >>> with open("motifs/meme.out") as f:
+    ...     record = minimal.read(f)
+    ...
+    >>> for motif in record:
+    ...     print(motif.name, motif.evalue)
+    ...
+    1 1.1e-22
+
+    You can access individual motifs in the record by their index or find a motif
+    by its name:
+
+    >>> from Bio import motifs
+    >>> with open("motifs/minimal_test.meme") as f:
+    ...     record = motifs.parse(f, 'minimal')
+    ...
+    >>> motif = record[0]
+    >>> print(motif.name)
+    KRP
+    >>> motif = record['IFXA']
+    >>> print(motif.name)
+    IFXA
+
+    This function won't retrieve instances, as there are none in the minimal MEME format.
+
+    """
+    motif_number = 0
+    record = Record()
+    _read_version(record, handle)
+    _read_alphabet(record, handle)
+    _read_background(record, handle)
+
+    while True:
+        for line in handle:
+            if line.startswith("MOTIF"):
+                break
+        else:
+            return record
+        name = line.split()[1]
+        motif_number += 1
+        length, num_occurrences, evalue = _read_motif_statistics(handle)
+        counts = _read_lpm(handle, num_occurrences)
+        # record.background has the form {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
+        motif = motifs.Motif(alphabet=record.alphabet, counts=counts)
+        motif.background = record.background
+        motif.length = length
+        motif.num_occurrences = num_occurrences
+        motif.evalue = evalue
+        motif.name = name
+        record.append(motif)
+        assert len(record) == motif_number
+    return record
+
+
+class Record(list):
+    """Class for holding the results of a minimal MEME run."""
+
+    def __init__(self):
+        """Initialize record class values."""
+        self.version = ""
+        self.datafile = ""
+        self.command = ""
+        self.alphabet = None
+        self.background = {}
+        self.sequences = []
+
+    def __getitem__(self, key):
+        """Return the motif of index key."""
+        if isinstance(key, str):
+            for motif in self:
+                if motif.name == key:
+                    return motif
+        else:
+            return list.__getitem__(self, key)
+
+
+# Everything below is private
+
+
+def _read_background(record, handle):
+    """Read background letter frequencies (PRIVATE)."""
+    for line in handle:
+        if line.startswith("Background letter frequencies"):
+            break
+    else:
+        raise ValueError(
+            "Improper input file. File should contain a line starting background frequencies."
+        )
+    try:
+        line = next(handle)
+    except StopIteration:
+        raise ValueError(
+            "Unexpected end of stream: Expected to find line starting background frequencies."
+        )
+    line = line.strip()
+    ls = line.split()
+    A, C, G, T = float(ls[1]), float(ls[3]), float(ls[5]), float(ls[7])
+    record.background = {"A": A, "C": C, "G": G, "T": T}
+
+
+def _read_version(record, handle):
+    """Read MEME version (PRIVATE)."""
+    for line in handle:
+        if line.startswith("MEME version"):
+            break
+    else:
+        raise ValueError(
+            "Improper input file. File should contain a line starting MEME version."
+        )
+    line = line.strip()
+    ls = line.split()
+    record.version = ls[2]
+
+
+def _read_alphabet(record, handle):
+    """Read alphabet (PRIVATE)."""
+    for line in handle:
+        if line.startswith("ALPHABET"):
+            break
+    else:
+        raise ValueError(
+            "Unexpected end of stream: Expected to find line starting with 'ALPHABET'"
+        )
+    if not line.startswith("ALPHABET= "):
+        raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
+    line = line.strip().replace("ALPHABET= ", "")
+    if line == "ACGT":
+        al = "ACGT"
+    else:
+        al = "ACDEFGHIKLMNPQRSTVWY"
+    record.alphabet = al
+
+
+def _read_lpm(handle, num_occurrences):
+    """Read letter probability matrix (PRIVATE)."""
+    counts = [[], [], [], []]
+    for line in handle:
+        freqs = line.split()
+        if len(freqs) != 4:
+            break
+        counts[0].append(round(float(freqs[0]) * num_occurrences))
+        counts[1].append(round(float(freqs[1]) * num_occurrences))
+        counts[2].append(round(float(freqs[2]) * num_occurrences))
+        counts[3].append(round(float(freqs[3]) * num_occurrences))
+    c = {}
+    c["A"] = counts[0]
+    c["C"] = counts[1]
+    c["G"] = counts[2]
+    c["T"] = counts[3]
+    return c
+
+
+def _read_motif_statistics(handle):
+    """Read motif statistics (PRIVATE)."""
+    # minimal :
+    #      letter-probability matrix: alength= 4 w= 19 nsites= 17 E= 4.1e-009
+    for line in handle:
+        if line.startswith("letter-probability matrix:"):
+            break
+    num_occurrences = int(line.split("nsites=")[1].split()[0])
+    length = int(line.split("w=")[1].split()[0])
+    evalue = float(line.split("E=")[1].split()[0])
+    return length, num_occurrences, evalue
+
+
+def _read_motif_name(handle):
+    """Read motif name (PRIVATE)."""
+    for line in handle:
+        if "sorted by position p-value" in line:
+            break
+    else:
+        raise ValueError("Unexpected end of stream: Failed to find motif name")
+    line = line.strip()
+    words = line.split()
+    name = " ".join(words[0:2])
+    return name
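As a self-contained check of the minimal-format reader above, a tiny hand-written file can be parsed from memory (the motif content is invented):

```python
from io import StringIO
from Bio.motifs import minimal

# A minimal MEME file with one two-column motif, written inline.
text = """\
MEME version 4

ALPHABET= ACGT

Background letter frequencies
A 0.25 C 0.25 G 0.25 T 0.25

MOTIF demo
letter-probability matrix: alength= 4 w= 2 nsites= 4 E= 1.0e-003
 0.25 0.25 0.25 0.25
 1.00 0.00 0.00 0.00
"""
record = minimal.read(StringIO(text))
print(record.version, record[0].name, record[0].num_occurrences)  # 4 demo 4
```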
diff --git a/code/lib/Bio/motifs/pfm.py b/code/lib/Bio/motifs/pfm.py
new file mode 100644
index 0000000..588f089
--- /dev/null
+++ b/code/lib/Bio/motifs/pfm.py
@@ -0,0 +1,413 @@
+# Copyright 2015 by Gert Hulselmans.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parse various position frequency matrix format files."""
+
+import re
+
+from Bio import motifs
+
+
+class Record(list):
+    """Class to store the information in a position frequency matrix table.
+
+    The record inherits from a list containing the individual motifs.
+    """
+
+    def __str__(self):
+        return "\n".join(str(motif) for motif in self)
+
+
+def read(handle, pfm_format):
+    """Read motif(s) from a file in various position frequency matrix formats.
+
+    Return the record of PFM(s).
+    Call the appropriate routine based on the format passed.
+    """
+    # Supporting underscores here for backward compatibility
+    pfm_format = pfm_format.lower().replace("_", "-")
+    if pfm_format == "pfm-four-columns":
+        record = _read_pfm_four_columns(handle)
+        return record
+    elif pfm_format == "pfm-four-rows":
+        record = _read_pfm_four_rows(handle)
+        return record
+    else:
+        raise ValueError("Unknown Position Frequency matrix format '%s'" % pfm_format)
+
+
+def _read_pfm_four_columns(handle):
+    """Read motifs in position frequency matrix format (4 columns) from a file handle.
+
+    # cisbp
+    Pos A   C   G   T
+    1   0.00961538461538462 0.00961538461538462 0.00961538461538462 0.971153846153846
+    2   0.00961538461538462 0.00961538461538462 0.00961538461538462 0.971153846153846
+    3   0.971153846153846   0.00961538461538462 0.00961538461538462 0.00961538461538462
+    4   0.00961538461538462 0.00961538461538462 0.00961538461538462 0.971153846153846
+    5   0.00961538461538462 0.971153846153846   0.00961538461538462 0.00961538461538462
+    6   0.971153846153846   0.00961538461538462 0.00961538461538462 0.00961538461538462
+    7   0.00961538461538462 0.971153846153846   0.00961538461538462 0.00961538461538462
+    8   0.00961538461538462 0.00961538461538462 0.00961538461538462 0.971153846153846
+
+    # c2h2 zfs
+    Gene    ENSG00000197372
+    Pos A   C   G   T
+    1   0.341303    0.132427    0.117054    0.409215
+    2   0.283785    0.077066    0.364552    0.274597
+    3   0.491055    0.078208    0.310520    0.120217
+    4   0.492621    0.076117    0.131007    0.300256
+    5   0.250645    0.361464    0.176504    0.211387
+    6   0.276694    0.498070    0.197793    0.027444
+    7   0.056317    0.014631    0.926202    0.002850
+    8   0.004470    0.007769    0.983797    0.003964
+    9   0.936213    0.058787    0.002387    0.002613
+    10  0.004352    0.004030    0.002418    0.989200
+    11  0.013277    0.008165    0.001991    0.976567
+    12  0.968132    0.002263    0.002868    0.026737
+    13  0.397623    0.052017    0.350783    0.199577
+    14  0.000000    0.000000    1.000000    0.000000
+    15  1.000000    0.000000    0.000000    0.000000
+    16  0.000000    0.000000    1.000000    0.000000
+    17  0.000000    0.000000    1.000000    0.000000
+    18  1.000000    0.000000    0.000000    0.000000
+    19  0.000000    1.000000    0.000000    0.000000
+    20  1.000000    0.000000    0.000000    0.000000
+
+    # c2h2 zfs
+    Gene    FBgn0000210
+    Motif   M1734_0.90
+    Pos A   C   G   T
+    1   0.25    0.0833333   0.0833333   0.583333
+    2   0.75    0.166667    0.0833333   0
+    3   0.833333    0   0   0.166667
+    4   1   0   0   0
+    5   0   0.833333    0.0833333   0.0833333
+    6   0.333333    0   0   0.666667
+    7   0.833333    0   0   0.166667
+    8   0.5 0   0.333333    0.166667
+    9   0.5 0.0833333   0.166667    0.25
+    10  0.333333    0.25    0.166667    0.25
+    11  0.166667    0.25    0.416667    0.166667
+
+    # flyfactorsurvey (cluster buster)
+    >AbdA_Cell_FBgn0000014
+    1   3   0   14
+    0   0   0   18
+    16  0   0   2
+    18  0   0   0
+    1   0   0   17
+    0   0   6   12
+    15  1   2   0
+
+    # homer
+    >ATGACTCATC AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer    6.049537    -1.782996e+03   0   9805.3,5781.0,3085.1,2715.0,0.00e+00
+    0.419   0.275   0.277   0.028
+    0.001   0.001   0.001   0.997
+    0.010   0.002   0.965   0.023
+    0.984   0.003   0.001   0.012
+    0.062   0.579   0.305   0.054
+    0.026   0.001   0.001   0.972
+    0.043   0.943   0.001   0.012
+    0.980   0.005   0.001   0.014
+    0.050   0.172   0.307   0.471
+    0.149   0.444   0.211   0.195
+
+    # hocomoco
+    > AHR_si
+    40.51343240527031  18.259112547756697  56.41253757072521  38.77363485291994
+    10.877470982533044  11.870876719950774  34.66312982331297  96.54723985087516
+    21.7165707818416  43.883079837598544  20.706746561638717  67.6523201955933
+    2.5465132509466635  1.3171620263517245  145.8637051322628  4.231336967110781
+    0.0  150.35847450464382  1.4927836298652875  2.1074592421627525
+    3.441039751299748  0.7902972158110341  149.37613720253387  0.3512432070271259
+    0.0  3.441039751299748  0.7024864140542533  149.81519121131782
+    0.0  0.0  153.95871737667187  0.0
+    43.07922333291745  66.87558226865211  16.159862546986584  27.844049228115868
+
+    # neph
+    UW.Motif.0001   atgactca
+    0.772949    0.089579    0.098612    0.038860
+    0.026652    0.004653    0.025056    0.943639
+    0.017663    0.023344    0.918728    0.040264
+    0.919596    0.025414    0.029759    0.025231
+    0.060312    0.772259    0.104968    0.062462
+    0.037406    0.020643    0.006667    0.935284
+    0.047316    0.899024    0.026928    0.026732
+    0.948639    0.019497    0.005737    0.026128
+
+    # tiffin
+    T   A   G   C
+    30  0   28  40
+    0   0   0   99
+    0   55  14  29
+    0   99  0   0
+    20  78  0   0
+    0   52  7   39
+    19  46  11  22
+    0   60  38  0
+    0   33  0   66
+    73  0   25  0
+    99  0   0   0
+    """
+    record = Record()
+
+    motif_name = None
+    motif_nbr = 0
+    motif_nbr_added = 0
+
+    default_nucleotide_order = ["A", "C", "G", "T"]
+    nucleotide_order = default_nucleotide_order
+    nucleotide_counts = {"A": [], "C": [], "G": [], "T": []}
+
+    for line in handle:
+        line = line.strip()
+
+        if line:
+            columns = line.split()
+            nbr_columns = len(columns)
+
+            if line.startswith("#"):
+                # Skip comment lines.
+                continue
+            elif line.startswith(">"):
+                # Parse ">AbdA_Cell_FBgn0000014" and "> AHR_si" like lines and put the part after ">" as motif name.
+                if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+                    # Add the previous motif to the record.
+                    motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                    motif.name = motif_name
+                    record.append(motif)
+                    motif_nbr_added = motif_nbr
+
+                # Reinitialize variables for the new motif.
+                motif_name = line[1:].strip()
+                nucleotide_order = default_nucleotide_order
+            elif columns[0] == "Gene":
+                # Parse "Gene   ENSG00000197372" like lines and put the gene name as motif name.
+                if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+                    # Add the previous motif to the record.
+                    motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                    motif.name = motif_name
+                    record.append(motif)
+                    motif_nbr_added = motif_nbr
+
+                # Reinitialize variables for the new motif.
+                motif_name = columns[1]
+                nucleotide_order = default_nucleotide_order
+            elif columns[0] == "Motif":
+                # Parse "Motif  M1734_0.90" like lines.
+                if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+                    # Add the previous motif to the record.
+                    motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                    motif.name = motif_name
+                    record.append(motif)
+                    motif_nbr_added = motif_nbr
+
+                # Reinitialize variables for the new motif.
+                motif_name = columns[1]
+                nucleotide_order = default_nucleotide_order
+            elif columns[0] == "Pos":
+                # Parse "Pos    A   C   G   T" like lines and change nucleotide order if necessary.
+                if nbr_columns == 5:
+                    # If the previous line was not a "Gene  ENSG00000197372" like line, a new motif starts here.
+                    if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+                        # Add the previous motif to the record.
+                        motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                        motif.name = motif_name
+                        record.append(motif)
+                        motif_nbr_added = motif_nbr
+
+                    nucleotide_order = default_nucleotide_order
+
+                    if set(columns[1:]) == set(default_nucleotide_order):
+                        nucleotide_order = columns[1:]
+            elif columns[0] in default_nucleotide_order:
+                # Parse "A  C   G   T" like lines and change nucleotide order if necessary.
+                if nbr_columns == 4:
+                    nucleotide_order = default_nucleotide_order
+                    if set(columns) == set(default_nucleotide_order):
+                        nucleotide_order = columns
+            else:
+                # Parse matrix columns lines and use the correct nucleotide order.
+                if nbr_columns == 4:
+                    matrix_columns = columns
+                elif nbr_columns == 5:
+                    matrix_columns = columns[1:]
+                else:
+                    continue
+
+                if motif_nbr == motif_nbr_added:
+                    # A new motif matrix starts here, so reinitialize variables for the new motif.
+                    nucleotide_counts = {"A": [], "C": [], "G": [], "T": []}
+                    motif_nbr += 1
+
+                # Append the counts following the current nucleotide order.
+                for nucleotide, nucleotide_count in zip(
+                    nucleotide_order, matrix_columns
+                ):
+                    nucleotide_counts[nucleotide].append(float(nucleotide_count))
+        else:
+            # Empty lines can be separators between motifs.
+            if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+                # Add the previous motif to the record.
+                motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+                motif.name = motif_name
+                record.append(motif)
+                motif_nbr_added = motif_nbr
+
+            # Reinitialize variables for the new motif.
+            motif_name = None
+            nucleotide_order = default_nucleotide_order
+            # nucleotide_counts = {'A': [], 'C': [], 'G': [], 'T': []}
+
+    if motif_nbr != 0 and motif_nbr_added != motif_nbr:
+        motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+        motif.name = motif_name
+        record.append(motif)
+
+    return record
+
+
+def _read_pfm_four_rows(handle):
+    """Read motifs in position frequency matrix format (4 rows) from a file handle.
+
+    # hdpi
+    A   0   5   6   5   1   0
+    C   1   1   0   0   0   4
+    G   5   0   0   0   3   0
+    T   0   0   0   1   2   2
+
+    # yetfasco
+    A   0.5 0.0 0.0 0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.5 0.0 0.0833333334583333
+    T   0.0 0.0 0.0 0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.0 0.0 0.0833333334583333
+    G   0.0 1.0 0.0 0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.0 1.0 0.249999999875
+    C   0.5 0.0 1.0 0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.25    0.5 0.0 0.583333333208333
+
+    # flyfactorsurvey ZFP finger
+    A |     92    106    231    135      0      1    780     28      0    700    739     94     60    127    130
+    C |    138     82    129     81    774      1      3      1      0      6     17     49    193    122    148
+    G |    270    398     54    164      7    659      1    750    755     65      1     41    202    234    205
+    T |    290    204    375    411      9    127      6     11     36     20     31    605    335    307    308
+
+    # scertf pcm
+    A | 9 1 1 97 1 94
+    T | 80 1 97 1 1 2
+    C | 9 97 1 1 1 2
+    G | 2 1 1 1 97 2
+
+    # scertf pfm
+    A | 0.090 0.010 0.010 0.970 0.010 0.940
+    C | 0.090 0.970 0.010 0.010 0.010 0.020
+    G | 0.020 0.010 0.010 0.010 0.970 0.020
+    T | 0.800 0.010 0.970 0.010 0.010 0.020
+
+    # idmmpmm
+    > abd-A
+    0.218451749734889 0.0230646871686108 0.656680805938494 0.898197242841994 0.040694591728526 0.132953340402969 0.74907211028632 0.628313891834571
+    0.0896076352067868 0.317338282078473 0.321580063626723 0.0461293743372216 0.0502386002120891 0.040694591728526 0.0284994697773065 0.0339342523860021
+    0.455991516436904 0.0691940615058324 0.0108695652173913 0.0217391304347826 0.0284994697773065 0.0284994697773065 0.016304347826087 0.160127253446448
+    0.235949098621421 0.590402969247084 0.0108695652173913 0.0339342523860021 0.880567338282079 0.797852598091198 0.206124072110286 0.17762460233298
+
+    # JASPAR
+        >MA0001.1 AGL3
+        A  [ 0  3 79 40 66 48 65 11 65  0 ]
+        C  [94 75  4  3  1  2  5  2  3  3 ]
+        G  [ 1  0  3  4  1  0  5  3 28 88 ]
+        T  [ 2 19 11 50 29 47 22 81  1  6 ]
+
+    or::
+
+        >MA0001.1 AGL3
+        0  3 79 40 66 48 65 11 65  0
+        94 75  4  3  1  2  5  2  3  3
+        1  0  3  4  1  0  5  3 28 88
+        2 19 11 50 29 47 22 81  1  6
+    """
+    record = Record()
+
+    name_pattern = re.compile(r"^>\s*(.+)\s*")
+    row_pattern_with_nucleotide_letter = re.compile(
+        r"\s*([ACGT])\s*[\[|]*\s*([0-9.\-eE\s]+)\s*\]*\s*"
+    )
+    row_pattern_without_nucleotide_letter = re.compile(r"\s*([0-9.\-eE\s]+)\s*")
+
+    motif_name = None
+    nucleotide_counts = {}
+    row_count = 0
+    nucleotides = ["A", "C", "G", "T"]
+
+    for line in handle:
+        line = line.strip()
+
+        name_match = name_pattern.match(line)
+        row_match_with_nucleotide_letter = row_pattern_with_nucleotide_letter.match(
+            line
+        )
+        row_match_without_nucleotide_letter = row_pattern_without_nucleotide_letter.match(
+            line
+        )
+
+        if name_match:
+            motif_name = name_match.group(1)
+        elif row_match_with_nucleotide_letter:
+            (nucleotide, counts_str) = row_match_with_nucleotide_letter.group(1, 2)
+            current_nucleotide_counts = counts_str.split()
+            nucleotide_counts[nucleotide] = [
+                float(current_nucleotide_count)
+                for current_nucleotide_count in current_nucleotide_counts
+            ]
+            row_count += 1
+            if row_count == 4:
+                motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+
+                if motif_name:
+                    motif.name = motif_name
+
+                record.append(motif)
+
+                motif_name = None
+                nucleotide_counts = {}
+                row_count = 0
+        elif row_match_without_nucleotide_letter:
+            current_nucleotide_counts = row_match_without_nucleotide_letter.group(
+                1
+            ).split()
+            nucleotide_counts[nucleotides[row_count]] = [
+                float(current_nucleotide_count)
+                for current_nucleotide_count in current_nucleotide_counts
+            ]
+            row_count += 1
+            if row_count == 4:
+                motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+
+                if motif_name:
+                    motif.name = motif_name
+
+                record.append(motif)
+
+                motif_name = None
+                nucleotide_counts = {}
+                row_count = 0
+
+    return record
+
+
+def write(motifs):
+    """Return the representation of motifs in Cluster Buster position frequency matrix format."""
+    lines = []
+    for m in motifs:
+        line = f">{m.name}\n"
+        lines.append(line)
+        for ACGT_counts in zip(
+            m.counts["A"], m.counts["C"], m.counts["G"], m.counts["T"]
+        ):
+            lines.append("{:0.0f}\t{:0.0f}\t{:0.0f}\t{:0.0f}\n".format(*ACGT_counts))
+
+    # Finished; glue the lines together.
+    text = "".join(lines)
+
+    return text
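A quick sketch of the four-rows reader and the Cluster Buster writer defined above, run on an in-memory scertf-style matrix (the motif is invented):

```python
from io import StringIO
from Bio.motifs import pfm

data = """\
>demo
A | 9 1 1 97 1 94
T | 80 1 97 1 1 2
C | 9 97 1 1 1 2
G | 2 1 1 1 97 2
"""
record = pfm.read(StringIO(data), "pfm-four-rows")
print(len(record), record[0].name, record[0].length)  # 1 demo 6
print(pfm.write(record))  # round-trip into Cluster Buster format
```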
diff --git a/code/lib/Bio/motifs/thresholds.py b/code/lib/Bio/motifs/thresholds.py
new file mode 100644
index 0000000..196b0a3
--- /dev/null
+++ b/code/lib/Bio/motifs/thresholds.py
@@ -0,0 +1,109 @@
+# Copyright 2008 by Norbert Dojer.  All rights reserved.
+# Adapted by Bartek Wilczynski.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Approximate calculation of appropriate thresholds for motif finding."""
+
+
+class ScoreDistribution:
+    """Class representing approximate score distribution for a given motif.
+
+    Utilizes a dynamic programming approach to calculate the distribution of
+    scores with a predefined precision. Provides a number of methods for calculating
+    thresholds for motif occurrences.
+    """
+
+    def __init__(self, motif=None, precision=10 ** 3, pssm=None, background=None):
+        """Initialize the class."""
+        if pssm is None:
+            self.min_score = min(0.0, motif.min_score())
+            self.interval = max(0.0, motif.max_score()) - self.min_score
+            self.n_points = precision * motif.length
+            self.ic = motif.ic()
+        else:
+            self.min_score = min(0.0, pssm.min)
+            self.interval = max(0.0, pssm.max) - self.min_score
+            self.n_points = precision * pssm.length
+            self.ic = pssm.mean(background)
+        self.step = self.interval / (self.n_points - 1)
+        self.mo_density = [0.0] * self.n_points
+        self.mo_density[-self._index_diff(self.min_score)] = 1.0
+        self.bg_density = [0.0] * self.n_points
+        self.bg_density[-self._index_diff(self.min_score)] = 1.0
+        if pssm is None:
+            for lo, mo in zip(motif.log_odds(), motif.pwm()):
+                self.modify(lo, mo, motif.background)
+        else:
+            for position in range(pssm.length):
+                mo_new = [0.0] * self.n_points
+                bg_new = [0.0] * self.n_points
+                lo = pssm[:, position]
+                for letter, score in lo.items():
+                    bg = background[letter]
+                    mo = pow(2, pssm[letter, position]) * bg
+                    d = self._index_diff(score)
+                    for i in range(self.n_points):
+                        mo_new[self._add(i, d)] += self.mo_density[i] * mo
+                        bg_new[self._add(i, d)] += self.bg_density[i] * bg
+                self.mo_density = mo_new
+                self.bg_density = bg_new
+
+    def _index_diff(self, x, y=0.0):
+        return int((x - y + 0.5 * self.step) // self.step)
+
+    def _add(self, i, j):
+        return max(0, min(self.n_points - 1, i + j))
+
+    def modify(self, scores, mo_probs, bg_probs):
+        """Modify motifs and background density."""
+        mo_new = [0.0] * self.n_points
+        bg_new = [0.0] * self.n_points
+        for k, v in scores.items():
+            d = self._index_diff(v)
+            for i in range(self.n_points):
+                mo_new[self._add(i, d)] += self.mo_density[i] * mo_probs[k]
+                bg_new[self._add(i, d)] += self.bg_density[i] * bg_probs[k]
+        self.mo_density = mo_new
+        self.bg_density = bg_new
+
+    def threshold_fpr(self, fpr):
+        """Approximate the log-odds threshold which makes the type I error (false positive rate)."""
+        i = self.n_points
+        prob = 0.0
+        while prob < fpr:
+            i -= 1
+            prob += self.bg_density[i]
+        return self.min_score + i * self.step
+
+    def threshold_fnr(self, fnr):
+        """Approximate the log-odds threshold which makes the type II error (false negative rate)."""
+        i = -1
+        prob = 0.0
+        while prob < fnr:
+            i += 1
+            prob += self.mo_density[i]
+        return self.min_score + i * self.step
+
+    def threshold_balanced(self, rate_proportion=1.0, return_rate=False):
+        """Approximate log-odds threshold making FNR equal to FPR times rate_proportion."""
+        i = self.n_points
+        fpr = 0.0
+        fnr = 1.0
+        while fpr * rate_proportion < fnr:
+            i -= 1
+            fpr += self.bg_density[i]
+            fnr -= self.mo_density[i]
+        if return_rate:
+            return self.min_score + i * self.step, fpr
+        else:
+            return self.min_score + i * self.step
+
+    def threshold_patser(self):
+        """Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software.
+
+        It selects a threshold such that log2(fpr) = -ic(M).
+        Note: the actual patser software uses natural logarithms instead of
+        log_2, so the numbers are not directly comparable.
+        """
+        return self.threshold_fpr(fpr=2 ** -self.ic)
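The distribution machinery above is typically driven from a PSSM; a brief sketch of selecting score cutoffs (the motif instances are invented):

```python
from Bio import motifs

m = motifs.create(["TACAA", "TACGC", "TACAC", "TACCC", "AACCC", "AATGC", "AATGC"])
m.pseudocounts = 0.5  # avoid -inf log-odds scores
background = {"A": 0.3, "C": 0.2, "G": 0.2, "T": 0.3}
m.background = background

distribution = m.pssm.distribution(background=background, precision=10 ** 3)
print("cutoff at 1% FPR:", distribution.threshold_fpr(0.01))
print("patser-style cutoff:", distribution.threshold_patser())
```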
diff --git a/code/lib/Bio/motifs/transfac.py b/code/lib/Bio/motifs/transfac.py
new file mode 100644
index 0000000..927cd2b
--- /dev/null
+++ b/code/lib/Bio/motifs/transfac.py
@@ -0,0 +1,325 @@
+# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parsing TRANSFAC files."""
+
+
+from Bio import motifs
+
+
+class Motif(motifs.Motif, dict):
+    """Store the information for one TRANSFAC motif.
+
+    This class inherits from the Bio.motifs.Motif base class, as well
+    as from a Python dictionary. All motif information found by the parser
+    is stored as attributes of the base class when possible; see the
+    Bio.motifs.Motif base class for a description of these attributes. All
+    other information associated with the motif is stored as (key, value)
+    pairs in the dictionary, where the key is the two-letter fields as found
+    in the TRANSFAC file. References are an exception: These are stored in
+    the .references attribute.
+
+    These fields are commonly found in TRANSFAC files::
+
+        AC:    Accession number
+        AS:    Accession numbers, secondary
+        BA:    Statistical basis
+        BF:    Binding factors
+        BS:    Factor binding sites underlying the matrix
+               [sequence; SITE accession number; start position for matrix
+               sequence; length of sequence used; number of gaps inserted;
+               strand orientation.]
+        CC:    Comments
+        CO:    Copyright notice
+        DE:    Short factor description
+        DR:    External databases
+               [database name: database accession number]
+        DT:    Date created/updated
+        HC:    Subfamilies
+        HP:    Superfamilies
+        ID:    Identifier
+        NA:    Name of the binding factor
+        OC:    Taxonomic classification
+        OS:    Species/Taxon
+        OV:    Older version
+        PV:    Preferred version
+        TY:    Type
+        XX:    Empty line; these are not stored in the Record.
+
+    References are stored in an .references attribute, which is a list of
+    dictionaries with the following keys::
+
+        RN:    Reference number
+        RA:    Reference authors
+        RL:    Reference data
+        RT:    Reference title
+        RX:    PubMed ID
+
+    For more information, see the TRANSFAC documentation.
+    """
+
+    multiple_value_keys = {"BF", "OV", "HP", "BS", "HC", "DT", "DR"}
+    # These keys can occur multiple times for one motif
+
+    reference_keys = {"RX", "RA", "RT", "RL"}
+    # These keys occur for references
+
+
+class Record(list):
+    """Store the information in a TRANSFAC matrix table.
+
+    The record inherits from a list containing the individual motifs.
+
+    Attributes:
+     - version - The version number, corresponding to the 'VV' field
+       in the TRANSFAC file;
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.version = None
+
+    def __str__(self):
+        """Turn the TRANSFAC matrix into a string."""
+        return write(self)
+
+
+def read(handle, strict=True):
+    """Parse a transfac format handle into a Record object."""
+    annotations = {}
+    references = []
+    counts = None
+    record = Record()
+    for line in handle:
+        line = line.strip()
+        if not line:
+            continue
+        key_value = line.split(None, 1)
+        key = key_value[0].strip()
+        if strict:
+            if len(key) != 2:
+                raise ValueError(
+                    "The key value of a TRANSFAC motif line should have 2 characters:"
+                    f'"{line}"'
+                )
+        if len(key_value) == 2:
+            value = key_value[1].strip()
+            if strict:
+                if not line.partition("  ")[1]:
+                    raise ValueError(
+                        "A TRANSFAC motif line should have 2 "
+                        "spaces between key and value columns: "
+                        f'"{line}"'
+                    )
+        if key == "VV":
+            record.version = value
+        elif key in ("P0", "PO"):  # Old TRANSFAC files use PO instead of P0
+            counts = {}
+            if value.split()[:4] != ["A", "C", "G", "T"]:
+                raise ValueError(
+                    f'A TRANSFAC matrix "{key}" line should be '
+                    f'followed by "A C G T": {line}'
+                )
+            length = 0
+            for c in "ACGT":
+                counts[c] = []
+            for line in handle:
+                line = line.strip()
+                key_value = line.split(None, 1)
+                key = key_value[0].strip()
+                if len(key_value) == 2:
+                    value = key_value[1].strip()
+                    if strict:
+                        if not line.partition("  ")[1]:
+                            raise ValueError(
+                                "A TRANSFAC motif line should have 2 spaces"
+                                f' between key and value columns: "{line}"'
+                            )
+                try:
+                    i = int(key)
+                except ValueError:
+                    break
+                if length == 0 and i == 0:
+                    if strict:
+                        raise ValueError(
+                            'A TRANSFAC matrix should start with "01" as first row'
+                            f' of the matrix, but this matrix uses "00": "{line}'
+                        )
+                else:
+                    length += 1
+                if i != length:
+                    raise ValueError(
+                        "The TRANSFAC matrix row number does not match the position"
+                        f' in the matrix: "{line}"'
+                    )
+                if strict:
+                    if len(key) == 1:
+                        raise ValueError(
+                            "A TRANSFAC matrix line should have a 2 digit"
+                            f' key at the start of the line ("{i:02d}"),'
+                            f' but this matrix uses "{i:d}": "{line:s}".'
+                        )
+                    if len(key_value) != 2:
+                        raise ValueError(
+                            "A TRANSFAC matrix line should have a key and a"
+                            f' value: "{line}"'
+                        )
+                values = value.split()[:4]
+                if len(values) != 4:
+                    raise ValueError(
+                        "A TRANSFAC matrix line should have a value for each"
+                        f' nucleotide (A, C, G and T): "{line}"'
+                    )
+                for c, v in zip("ACGT", values):
+                    counts[c].append(float(v))
+        if line == "XX":
+            pass
+        elif key == "RN":
+            index, separator, accession = value.partition(";")
+            if index[0] != "[":
+                raise ValueError(
+                    f'The index "{index}" in a TRANSFAC RN line should start'
+                    f' with a "[": "{line}"'
+                )
+            if index[-1] != "]":
+                raise ValueError(
+                    f'The index "{index}" in a TRANSFAC RN line should end'
+                    f' with a "]": "{line}"'
+                )
+            index = int(index[1:-1])
+            if len(references) != index - 1:
+                raise ValueError(
+                    f'The index "{index:d}" of the TRANSFAC RN line does not '
+                    "match the current number of seen references "
+                    f'"{len(references) + 1:d}": "{line:s}"'
+                )
+            reference = {key: value}
+            references.append(reference)
+        elif key == "//":
+            if counts is not None:
+                motif = Motif(alphabet="ACGT", counts=counts)
+                motif.update(annotations)
+                motif.references = references
+                record.append(motif)
+            annotations = {}
+            references = []
+        elif key in Motif.reference_keys:
+            reference[key] = value
+        elif key in Motif.multiple_value_keys:
+            if key not in annotations:
+                annotations[key] = []
+            annotations[key].append(value)
+        else:
+            annotations[key] = value
+    return record
+
+
+def write(motifs):
+    """Write the representation of a motif in TRANSFAC format."""
+    blocks = []
+    try:
+        version = motifs.version
+    except AttributeError:
+        pass
+    else:
+        if version is not None:
+            block = (
+                """\
+VV  %s
+XX
+//
+"""
+                % version
+            )
+            blocks.append(block)
+    multiple_value_keys = Motif.multiple_value_keys
+    sections = (
+        ("AC", "AS"),  # Accession
+        ("ID",),  # ID
+        ("DT", "CO"),  # Date, copyright
+        ("NA",),  # Name
+        ("DE",),  # Short factor description
+        ("TY",),  # Type
+        ("OS", "OC"),  # Organism
+        ("HP", "HC"),  # Superfamilies, subfamilies
+        ("BF",),  # Binding factors
+        ("P0",),  # Frequency matrix
+        ("BA",),  # Statistical basis
+        ("BS",),  # Factor binding sites
+        ("CC",),  # Comments
+        ("DR",),  # External databases
+        ("OV", "PV"),  # Versions
+    )
+    for motif in motifs:
+        lines = []
+        for section in sections:
+            blank = False
+            for key in section:
+                if key == "P0":
+                    # Frequency matrix
+                    length = motif.length
+                    if length == 0:
+                        continue
+                    sequence = motif.degenerate_consensus
+                    letters = sorted(motif.alphabet)
+                    line = "      ".join(["P0"] + letters)
+
+                    lines.append(line)
+                    for i in range(length):
+                        line = (
+                            " ".join(["%02.d"] + ["%6.20g" for _ in letters])
+                            + "      %s"
+                        )
+                        line = line % tuple(
+                            [i + 1]
+                            + [motif.counts[l][i] for l in letters]
+                            + [sequence[i]]
+                        )
+                        lines.append(line)
+                    blank = True
+                else:
+                    try:
+                        value = motif.get(key)
+                    except AttributeError:
+                        value = None
+                    if value is not None:
+                        if key in multiple_value_keys:
+                            for v in value:
+                                line = "%s  %s" % (key, v)
+                                lines.append(line)
+                        else:
+                            line = "%s  %s" % (key, value)
+                            lines.append(line)
+                        blank = True
+                if key == "PV":
+                    # References
+                    try:
+                        references = motif.references
+                    except AttributeError:
+                        pass
+                    else:
+                        keys = ("RN", "RX", "RA", "RT", "RL")
+                        for reference in references:
+                            for key in keys:
+                                value = reference.get(key)
+                                if value is None:
+                                    continue
+                                line = "%s  %s" % (key, value)
+                                lines.append(line)
+                                blank = True
+            if blank:
+                line = "XX"
+                lines.append(line)
+        # Finished this motif; glue the lines together
+        line = "//"
+        lines.append(line)
+        block = "\n".join(lines) + "\n"
+        blocks.append(block)
+    # Finished all motifs; glue the blocks together
+    text = "".join(blocks)
+    return text
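To see the strict TRANSFAC parser above in action, a small record can be fed from memory (the matrix values are invented):

```python
from io import StringIO
from Bio.motifs import transfac

sample = """\
ID  motif1
P0      A      C      G      T
01      1      2      2      0      S
02      2      1      2      0      R
03      3      0      1      1      A
XX
//
"""
record = transfac.read(StringIO(sample))
motif = record[0]
print(motif["ID"], motif.length)  # motif1 3
print(transfac.write(record))     # serialize back to TRANSFAC format
```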
diff --git a/code/lib/Bio/motifs/xms.py b/code/lib/Bio/motifs/xms.py
new file mode 100644
index 0000000..46643c1
--- /dev/null
+++ b/code/lib/Bio/motifs/xms.py
@@ -0,0 +1,105 @@
+# Copyright 2015 by Gert Hulselmans.  All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parse XMS motif files."""
+
+from Bio import motifs
+
+
+from xml.dom import minidom, Node
+import re
+
+
+class XMSScanner:
+    """Class for scanning XMS XML file."""
+
+    def __init__(self, doc):
+        """Generate motif Record from xms document, an XML-like motif pfm file."""
+        self.record = Record()
+        for child in doc.getElementsByTagName("motif"):
+            if child.nodeType == Node.ELEMENT_NODE:
+                self.handle_motif(child)
+
+    def handle_motif(self, node):
+        """Read the motif's name and column from the node and add the motif record."""
+        motif_name = self.get_text(node.getElementsByTagName("name"))
+        nucleotide_counts = {"A": [], "C": [], "G": [], "T": []}
+
+        for column in node.getElementsByTagName("column"):
+            # Append each column's A/C/G/T weights to the running counts.
+            for nucleotide, nucleotide_count in zip(
+                ["A", "C", "G", "T"], self.get_acgt(column)
+            ):
+                nucleotide_counts[nucleotide].append(float(nucleotide_count))
+
+        motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts)
+        motif.name = motif_name
+
+        self.record.append(motif)
+
+    def get_property_value(self, node, key_name):
+        """Extract the value of the motif's property named key_name from node."""
+        for cur_property in node.getElementsByTagName("prop"):
+            right_property = False
+            cur_value = None
+            for child in cur_property.childNodes:
+                if child.nodeType != Node.ELEMENT_NODE:
+                    continue
+                if child.tagName == "key" and self.get_text([child]) == key_name:
+                    right_property = True
+                if child.tagName == "value":
+                    cur_value = self.get_text([child])
+            if right_property:
+                return cur_value
+        return None
+
+    def get_acgt(self, node):
+        """Get and return the motif's weights of A, C, G, T."""
+        a, c, g, t = 0.0, 0.0, 0.0, 0.0
+        for weight in node.getElementsByTagName("weight"):
+            if weight.getAttribute("symbol") == "adenine":
+                a = float(self.get_text([weight]))
+            elif weight.getAttribute("symbol") == "cytosine":
+                c = float(self.get_text([weight]))
+            elif weight.getAttribute("symbol") == "guanine":
+                g = float(self.get_text([weight]))
+            elif weight.getAttribute("symbol") == "thymine":
+                t = float(self.get_text([weight]))
+        return a, c, g, t
+
+    def get_text(self, nodelist):
+        """Return a string representation of the motif's properties listed on nodelist ."""
+        retlist = []
+        for node in nodelist:
+            if node.nodeType == Node.TEXT_NODE:
+                retlist.append(node.wholeText)
+            elif node.hasChildNodes():
+                retlist.append(self.get_text(node.childNodes))
+
+        return re.sub(r"\s+", " ", "".join(retlist))
+
+
+class Record(list):
+    """Class to store the information in a XMS matrix table.
+
+    The record inherits from a list containing the individual motifs.
+    """
+
+    def __str__(self):
+        return "\n".join(str(motif) for motif in self)
+
+
+def read(handle):
+    """Read motifs in XMS matrix format from a file handle.
+
+    XMS is an XML format for describing regulatory motifs and PSSMs.
+    This format was defined by Thomas Down, and used in the NestedMICA and MotifExplorer programs.
+    """
+    xms_doc = minidom.parse(handle)
+    record = XMSScanner(xms_doc).record
+
+    return record
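And a matching sketch for the XMS scanner above, on a one-column document built inline (the element layout follows the handlers in XMSScanner; the motif is invented):

```python
from io import StringIO
from Bio.motifs import xms

doc = """\
<motifset>
  <motif>
    <name>demo</name>
    <column>
      <weight symbol="adenine">0.7</weight>
      <weight symbol="cytosine">0.1</weight>
      <weight symbol="guanine">0.1</weight>
      <weight symbol="thymine">0.1</weight>
    </column>
  </motif>
</motifset>
"""
record = xms.read(StringIO(doc))
print(record[0].name, record[0].length)  # demo 1
```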
diff --git a/code/lib/Bio/pairwise2.py b/code/lib/Bio/pairwise2.py
new file mode 100644
index 0000000..797ce20
--- /dev/null
+++ b/code/lib/Bio/pairwise2.py
@@ -0,0 +1,1431 @@
+# Copyright 2002 by Jeffrey Chang.
+# Copyright 2016, 2019, 2020 by Markus Piotrowski.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Pairwise sequence alignment using a dynamic programming algorithm.
+
+This provides functions to get global and local alignments between two
+sequences. A global alignment finds the best concordance between all
+characters in two sequences. A local alignment finds just the
+subsequences that align the best. Local alignments must have a positive
+score to be reported and they will not be extended for 'zero counting'
+matches. This means a local alignment will always start and end with
+a positive counting match.
+
+When doing alignments, you can specify the match score and gap
+penalties.  The match score indicates the compatibility between an
+alignment of two characters in the sequences. Highly compatible
+characters should be given positive scores, and incompatible ones
+should be given negative scores or 0.  The gap penalties should be
+negative.
+
+The names of the alignment functions in this module follow the
+convention
+<alignment type>XX
+where <alignment type> is either "global" or "local" and XX is a 2
+character code indicating the parameters it takes.  The first
+character indicates the parameters for matches (and mismatches), and
+the second indicates the parameters for gap penalties.
+
+The match parameters are::
+
+    CODE  DESCRIPTION & OPTIONAL KEYWORDS
+    x     No parameters. Identical characters have score of 1, otherwise 0.
+    m     A match score is the score of identical chars, otherwise mismatch
+          score. Keywords ``match``, ``mismatch``.
+    d     A dictionary returns the score of any pair of characters.
+          Keyword ``match_dict``.
+    c     A callback function returns scores. Keyword ``match_fn``.
+
+The gap penalty parameters are::
+
+    CODE  DESCRIPTION & OPTIONAL KEYWORDS
+    x     No gap penalties.
+    s     Same open and extend gap penalties for both sequences.
+          Keywords ``open``, ``extend``.
+    d     The sequences have different open and extend gap penalties.
+          Keywords ``openA``, ``extendA``, ``openB``, ``extendB``.
+    c     A callback function returns the gap penalties.
+          Keywords ``gap_A_fn``, ``gap_B_fn``.
+
+All the different alignment functions are contained in an object
+``align``. For example:
+
+    >>> from Bio import pairwise2
+    >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
+
+For better readability, the required arguments can be used with optional keywords:
+
+    >>> alignments = pairwise2.align.globalxx(sequenceA="ACCGT", sequenceB="ACG")
+
+The result is a list of the alignments between the two strings. Each alignment
+is a named tuple consisting of the two aligned sequences, the score and the
+start and end positions of the alignment:
+
+   >>> print(alignments)
+   [Alignment(seqA='ACCGT', seqB='A-CG-', score=3.0, start=0, end=5), ...
+
+You can access each element of an alignment by index or name:
+
+   >>> alignments[0][2]
+   3.0
+   >>> alignments[0].score
+   3.0
+
+For a nice printout of an alignment, use the ``format_alignment`` method of
+the module:
+
+    >>> from Bio.pairwise2 import format_alignment
+    >>> print(format_alignment(*alignments[0]))
+    ACCGT
+    | || 
+    A-CG-
+      Score=3
+    
+
+All alignment functions have the following arguments:
+
+- Two sequences: strings, Biopython sequence objects or lists.
+  Lists are useful for supplying sequences which contain residues that are
+  encoded by more than one letter.
+
+- ``penalize_extend_when_opening``: boolean (default: False).
+  Whether to count an extension penalty when opening a gap. If false, a gap of
+  1 is only penalized an "open" penalty, otherwise it is penalized
+  "open+extend".
+
+- ``penalize_end_gaps``: boolean.
+  Whether to count the gaps at the ends of an alignment. By default, they are
+  counted for global alignments but not for local ones. Setting
+  ``penalize_end_gaps`` to (boolean, boolean) allows you to specify for the
+  two sequences separately whether gaps at the end of the alignment should be
+  counted.
+
+- ``gap_char``: string (default: ``'-'``).
+  Which character to use as a gap character in the alignment returned. If your
+  input sequences are lists, you must change this to ``['-']``.
+
+- ``force_generic``: boolean (default: False).
+  Always use the generic, non-cached, dynamic programming function (slow!).
+  For debugging.
+
+- ``score_only``: boolean (default: False).
+  Only get the best score, don't recover any alignments. The return value of
+  the function is the score. Faster and uses less memory.
+
+- ``one_alignment_only``: boolean (default: False).
+  Only recover one alignment.
+
+The other parameters of the alignment function depend on the function called.
+Some examples:
+
+- Find the best global alignment between the two sequences. Identical
+  characters are given 1 point. No points are deducted for mismatches or gaps.
+
+    >>> for a in pairwise2.align.globalxx("ACCGT", "ACG"):
+    ...     print(format_alignment(*a))
+    ACCGT
+    | || 
+    A-CG-
+      Score=3
+    
+    ACCGT
+    || | 
+    AC-G-
+      Score=3
+    
+
+- Same thing as before, but with a local alignment. Note that
+  ``format_alignment`` will only show the aligned parts of the sequences,
+  together with the starting positions.
+
+    >>> for a in pairwise2.align.localxx("ACCGT", "ACG"):
+    ...     print(format_alignment(*a))
+    1 ACCG
+      | ||
+    1 A-CG
+      Score=3
+    
+    1 ACCG
+      || |
+    1 AC-G
+      Score=3
+    
+
+  To restore the 'historic' behaviour of ``format_alignment``, i.e., showing
+  also the un-aligned parts of both sequences, use the new keyword parameter
+  ``full_sequences``:
+
+    >>> for a in pairwise2.align.localxx("ACCGT", "ACG"):
+    ...     print(format_alignment(*a, full_sequences=True))
+    ACCGT
+    | || 
+    A-CG-
+      Score=3
+    
+    ACCGT
+    || | 
+    AC-G-
+      Score=3
+    
+
+
+- Do a global alignment. Identical characters are given 2 points, 1 point is
+  deducted for each non-identical character. Don't penalize gaps.
+
+    >>> for a in pairwise2.align.globalmx("ACCGT", "ACG", 2, -1):
+    ...     print(format_alignment(*a))
+    ACCGT
+    | || 
+    A-CG-
+      Score=6
+    
+    ACCGT
+    || | 
+    AC-G-
+      Score=6
+    
+
+- Same as above, except now 0.5 points are deducted when opening a gap, and
+  0.1 points are deducted when extending it.
+
+    >>> for a in pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1):
+    ...     print(format_alignment(*a))
+    ACCGT
+    | || 
+    A-CG-
+      Score=5
+    
+    ACCGT
+    || | 
+    AC-G-
+      Score=5
+    
+
+- Note that you can use keywords to increase the readability, e.g.:
+
+    >>> a = pairwise2.align.globalms("ACGT", "ACG", match=2, mismatch=-1, open=-.5,
+    ...                              extend=-.1)
+
+- Depending on the penalties, a gap in one sequence may be followed by a gap in
+  the other sequence. If you don't like this behaviour, increase the gap-open
+  penalty:
+
+    >>> for a in pairwise2.align.globalms("A", "T", 5, -4, -1, -.1):
+    ...     print(format_alignment(*a))
+    A-
+    
+    -T
+      Score=-2
+    
+    >>> for a in pairwise2.align.globalms("A", "T", 5, -4, -3, -.1):
+    ...	    print(format_alignment(*a))
+    A
+    .
+    T
+      Score=-4
+    
+
+- The alignment function can also use known matrices already included in
+  Biopython (in ``Bio.Align.substitution_matrices``):
+
+    >>> from Bio.Align import substitution_matrices
+    >>> matrix = substitution_matrices.load("BLOSUM62")
+    >>> for a in pairwise2.align.globaldx("KEVLA", "EVL", matrix):
+    ...     print(format_alignment(*a))
+    KEVLA
+     ||| 
+    -EVL-
+      Score=13
+    
+
+- With the parameter ``c`` you can define your own match- and gap functions.
+  E.g. to define an affine logarithmic gap function and using it:
+
+    >>> from math import log
+    >>> def gap_function(x, y):  # x is gap position in seq, y is gap length
+    ...     if y == 0:  # No gap
+    ...         return 0
+    ...     elif y == 1:  # Gap open penalty
+    ...         return -2
+    ...     return - (2 + y/4.0 + log(y)/2.0)
+    ...
+    >>> alignment = pairwise2.align.globalmc("ACCCCCGT", "ACG", 5, -4,
+    ...                                      gap_function, gap_function)
+
+  You can define different gap functions for each sequence.
+  Self-defined match functions must take the two residues to be compared and
+  return a score.
+
+To see a description of the parameters for a function, please look at
+the docstring for the function via the help function, e.g.
+type ``help(pairwise2.align.localds)`` at the Python prompt.
+
+"""  # noqa: W291
+
+import warnings
+from collections import namedtuple
+
+from Bio import BiopythonWarning
+
+
+MAX_ALIGNMENTS = 1000  # maximum alignments recovered in traceback
+
+
+class align:
+    """Provide functions that do alignments.
+
+    Alignment functions are called as:
+
+      pairwise2.align.globalXX
+
+    or
+
+      pairwise2.align.localXX
+
+    Where XX is a 2 character code indicating the match/mismatch parameters
+    (first character, either x, m, d or c) and the gap penalty parameters
+    (second character, either x, s, d, or c).
+
+    For a detailed description read the main module's docstring (e.g.,
+    type ``help(pairwise2)``).
+    To see a description of the parameters for a function, please
+    look at the docstring for the function, e.g. type
+    ``help(pairwise2.align.localds)`` at the Python prompt.
+    """
+
+    class alignment_function:
+        """Callable class which impersonates an alignment function.
+
+        The constructor takes the name of the function.  This class
+        will decode the name of the function to figure out how to
+        interpret the parameters.
+        """
+
+        # match code -> tuple of (parameters, docstring)
+        match2args = {
+            "x": ([], ""),
+            "m": (
+                ["match", "mismatch"],
+                "match is the score to given to identical characters.\n"
+                "mismatch is the score given to non-identical ones.",
+            ),
+            "d": (
+                ["match_dict"],
+                "match_dict is a dictionary where the keys are tuples\n"
+                "of pairs of characters and the values are the scores,\n"
+                "e.g. ('A', 'C') : 2.5.",
+            ),
+            "c": (
+                ["match_fn"],
+                "match_fn is a callback function that takes two "
+                "characters and returns the score between them.",
+            ),
+        }
+        # penalty code -> tuple of (parameters, docstring)
+        penalty2args = {
+            "x": ([], ""),
+            "s": (
+                ["open", "extend"],
+                "open and extend are the gap penalties when a gap is\n"
+                "opened and extended.  They should be negative.",
+            ),
+            "d": (
+                ["openA", "extendA", "openB", "extendB"],
+                "openA and extendA are the gap penalties for sequenceA,\n"
+                "and openB and extendB for sequenceB.  The penalties\n"
+                "should be negative.",
+            ),
+            "c": (
+                ["gap_A_fn", "gap_B_fn"],
+                "gap_A_fn and gap_B_fn are callback functions that takes\n"
+                "(1) the index where the gap is opened, and (2) the length\n"
+                "of the gap.  They should return a gap penalty.",
+            ),
+        }
+
+        def __init__(self, name):
+            """Check to make sure the name of the function is reasonable."""
+            if name.startswith("global"):
+                if len(name) != 8:
+                    raise AttributeError("function should be globalXX")
+            elif name.startswith("local"):
+                if len(name) != 7:
+                    raise AttributeError("function should be localXX")
+            else:
+                raise AttributeError(name)
+            align_type, match_type, penalty_type = name[:-2], name[-2], name[-1]
+            try:
+                match_args, match_doc = self.match2args[match_type]
+            except KeyError:
+                raise AttributeError("unknown match type %r" % match_type)
+            try:
+                penalty_args, penalty_doc = self.penalty2args[penalty_type]
+            except KeyError:
+                raise AttributeError("unknown penalty type %r" % penalty_type)
+
+            # Now get the names of the parameters to this function.
+            param_names = ["sequenceA", "sequenceB"]
+            param_names.extend(match_args)
+            param_names.extend(penalty_args)
+            self.function_name = name
+            self.align_type = align_type
+            self.param_names = param_names
+
+            self.__name__ = self.function_name
+            # Set the doc string.
+            doc = "%s(%s) -> alignments\n" % (
+                self.__name__,
+                ", ".join(self.param_names),
+            )
+            doc += """\
+\nAll of the following parameters can also be passed as
+keywords of the same name.\n\n
+sequenceA and sequenceB must be of the same type, either
+strings, lists or Biopython sequence objects.\n
+"""
+            if match_doc:
+                doc += "\n%s\n" % match_doc
+            if penalty_doc:
+                doc += "\n%s\n" % penalty_doc
+            doc += """\
+\nalignments is a list of named tuples (seqA, seqB, score,
+begin, end). seqA and seqB are strings showing the alignment
+between the sequences.  score is the score of the alignment.
+begin and end are indexes of seqA and seqB that indicate
+where the alignment occurs.
+"""
+            self.__doc__ = doc
+
+        def decode(self, *args, **keywds):
+            """Decode the arguments for the _align function.
+
+            keywds will get passed to it, so translate the arguments
+            to this function into forms appropriate for _align.
+            """
+            keywds = keywds.copy()
+
+            # Replace possible "keywords" with arguments:
+            args += (len(self.param_names) - len(args)) * (None,)
+            for key in keywds.copy():
+                if key in self.param_names:
+                    _index = self.param_names.index(key)
+                    args = args[:_index] + (keywds[key],) + args[_index:]
+                    del keywds[key]
+            args = tuple(arg for arg in args if arg is not None)
+
+            if len(args) != len(self.param_names):
+                raise TypeError(
+                    "%s takes exactly %d argument (%d given)"
+                    % (self.function_name, len(self.param_names), len(args))
+                )
+
+            i = 0
+            while i < len(self.param_names):
+                if self.param_names[i] in [
+                    "sequenceA",
+                    "sequenceB",
+                    "gap_A_fn",
+                    "gap_B_fn",
+                    "match_fn",
+                ]:
+                    keywds[self.param_names[i]] = args[i]
+                    i += 1
+                elif self.param_names[i] == "match":
+                    assert self.param_names[i + 1] == "mismatch"
+                    match, mismatch = args[i], args[i + 1]
+                    keywds["match_fn"] = identity_match(match, mismatch)
+                    i += 2
+                elif self.param_names[i] == "match_dict":
+                    keywds["match_fn"] = dictionary_match(args[i])
+                    i += 1
+                elif self.param_names[i] == "open":
+                    assert self.param_names[i + 1] == "extend"
+                    open, extend = args[i], args[i + 1]
+                    pe = keywds.get("penalize_extend_when_opening", 0)
+                    keywds["gap_A_fn"] = affine_penalty(open, extend, pe)
+                    keywds["gap_B_fn"] = affine_penalty(open, extend, pe)
+                    i += 2
+                elif self.param_names[i] == "openA":
+                    assert self.param_names[i + 3] == "extendB"
+                    openA, extendA, openB, extendB = args[i : i + 4]
+                    pe = keywds.get("penalize_extend_when_opening", 0)
+                    keywds["gap_A_fn"] = affine_penalty(openA, extendA, pe)
+                    keywds["gap_B_fn"] = affine_penalty(openB, extendB, pe)
+                    i += 4
+                else:
+                    raise ValueError("unknown parameter %r" % self.param_names[i])
+
+            # Here are the default parameters for _align.  Assign
+            # these to keywds, unless already specified.
+            pe = keywds.get("penalize_extend_when_opening", 0)
+            default_params = [
+                ("match_fn", identity_match(1, 0)),
+                ("gap_A_fn", affine_penalty(0, 0, pe)),
+                ("gap_B_fn", affine_penalty(0, 0, pe)),
+                ("penalize_extend_when_opening", 0),
+                ("penalize_end_gaps", self.align_type == "global"),
+                ("align_globally", self.align_type == "global"),
+                ("gap_char", "-"),
+                ("force_generic", 0),
+                ("score_only", 0),
+                ("one_alignment_only", 0),
+            ]
+            for name, default in default_params:
+                keywds[name] = keywds.get(name, default)
+            value = keywds["penalize_end_gaps"]
+            try:
+                n = len(value)
+            except TypeError:
+                keywds["penalize_end_gaps"] = tuple([value] * 2)
+            else:
+                assert n == 2
+            return keywds
+
+        def __call__(self, *args, **keywds):
+            """Call the alignment instance already created."""
+            keywds = self.decode(*args, **keywds)
+            return _align(**keywds)
+
+    def __getattr__(self, attr):
+        """Call alignment_function() to check and decode the attributes."""
+        # The following 'magic' is needed to rewrite the class docstring
+        # dynamically:
+        wrapper = self.alignment_function(attr)
+        wrapper_type = type(wrapper)
+        wrapper_dict = wrapper_type.__dict__.copy()
+        wrapper_dict["__doc__"] = wrapper.__doc__
+        new_alignment_function = type("alignment_function", (object,), wrapper_dict)
+
+        return new_alignment_function(attr)
+
+
+align = align()
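+
+# Example usage (an illustrative sketch):
+#
+#     alignments = align.globalxx("ACCGT", "ACG")
+#     best = alignments[0]   # named tuple: (seqA, seqB, score, start, end)
+#     print(best.score)      # 3 identical characters with globalxx -> score 3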
+
+
+def _align(
+    sequenceA,
+    sequenceB,
+    match_fn,
+    gap_A_fn,
+    gap_B_fn,
+    penalize_extend_when_opening,
+    penalize_end_gaps,
+    align_globally,
+    gap_char,
+    force_generic,
+    score_only,
+    one_alignment_only,
+):
+    """Return optimal alignments between two sequences (PRIVATE).
+
+    This method either returns a list of optimal alignments (with the same
+    score) or just the optimal score.
+    """
+    if not sequenceA or not sequenceB:
+        return []
+    try:
+        sequenceA + gap_char
+        sequenceB + gap_char
+    except TypeError:
+        raise TypeError(
+            "both sequences must be of the same type, either "
+            "string/sequence object or list. Gap character must "
+            "fit the sequence type (string or list)"
+        )
+
+    if not isinstance(sequenceA, list):
+        sequenceA = str(sequenceA)
+    if not isinstance(sequenceB, list):
+        sequenceB = str(sequenceB)
+    if not align_globally and (penalize_end_gaps[0] or penalize_end_gaps[1]):
+        warnings.warn(
+            '"penalize_end_gaps" should not be used in local '
+            "alignments. The resulting score may be wrong.",
+            BiopythonWarning,
+        )
+
+    if (
+        (not force_generic)
+        and isinstance(gap_A_fn, affine_penalty)
+        and isinstance(gap_B_fn, affine_penalty)
+    ):
+        open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
+        open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
+        matrices = _make_score_matrix_fast(
+            sequenceA,
+            sequenceB,
+            match_fn,
+            open_A,
+            extend_A,
+            open_B,
+            extend_B,
+            penalize_extend_when_opening,
+            penalize_end_gaps,
+            align_globally,
+            score_only,
+        )
+    else:
+        matrices = _make_score_matrix_generic(
+            sequenceA,
+            sequenceB,
+            match_fn,
+            gap_A_fn,
+            gap_B_fn,
+            penalize_end_gaps,
+            align_globally,
+            score_only,
+        )
+
+    score_matrix, trace_matrix, best_score = matrices
+
+    # print("SCORE %s" % print_matrix(score_matrix))
+    # print("TRACEBACK %s" % print_matrix(trace_matrix))
+
+    # If they only want the score, then return it.
+    if score_only:
+        return best_score
+
+    starts = _find_start(score_matrix, best_score, align_globally)
+
+    # Recover the alignments and return them.
+    alignments = _recover_alignments(
+        sequenceA,
+        sequenceB,
+        starts,
+        best_score,
+        score_matrix,
+        trace_matrix,
+        align_globally,
+        gap_char,
+        one_alignment_only,
+        gap_A_fn,
+        gap_B_fn,
+    )
+    if not alignments:
+        # This may happen, see recover_alignments for explanation
+        score_matrix, trace_matrix = _reverse_matrices(score_matrix, trace_matrix)
+        starts = [(z, (y, x)) for z, (x, y) in starts]
+        alignments = _recover_alignments(
+            sequenceB,
+            sequenceA,
+            starts,
+            best_score,
+            score_matrix,
+            trace_matrix,
+            align_globally,
+            gap_char,
+            one_alignment_only,
+            gap_B_fn,
+            gap_A_fn,
+            reverse=True,
+        )
+    return alignments
+
+
+def _make_score_matrix_generic(
+    sequenceA,
+    sequenceB,
+    match_fn,
+    gap_A_fn,
+    gap_B_fn,
+    penalize_end_gaps,
+    align_globally,
+    score_only,
+):
+    """Generate a score and traceback matrix (PRIVATE).
+
+    This implementation according to Needleman-Wunsch allows the usage of
+    general gap functions and is rather slow. It is automatically called if
+    you define your own gap functions. You can force the usage of this method
+    with ``force_generic=True``.
+    """
+    local_max_score = 0
+    # Create the score and traceback matrices. These should be in the
+    # shape:
+    # sequenceA (down) x sequenceB (across)
+    lenA, lenB = len(sequenceA), len(sequenceB)
+    score_matrix, trace_matrix = [], []
+    for i in range(lenA + 1):
+        score_matrix.append([None] * (lenB + 1))
+        if not score_only:
+            trace_matrix.append([None] * (lenB + 1))
+
+    # Initialize first row and column with gap scores. This is like opening up
+    # i gaps at the beginning of sequence A or B.
+    for i in range(lenA + 1):
+        if penalize_end_gaps[1]:  # [1]:gap in sequence B
+            score = gap_B_fn(0, i)
+        else:
+            score = 0.0
+        score_matrix[i][0] = score
+    for i in range(lenB + 1):
+        if penalize_end_gaps[0]:  # [0]:gap in sequence A
+            score = gap_A_fn(0, i)
+        else:
+            score = 0.0
+        score_matrix[0][i] = score
+
+    # Fill in the score matrix.  Each position in the matrix
+    # represents an alignment between a character from sequence A and
+    # one in sequence B.  As we iterate through the matrix, we find the
+    # alignment by choosing the best of:
+    #    1) extending a previous alignment without gaps
+    #    2) adding a gap in sequenceA
+    #    3) adding a gap in sequenceB
+    for row in range(1, lenA + 1):
+        for col in range(1, lenB + 1):
+            # First, calculate the score that would occur by extending
+            # the alignment without gaps.
+            # fmt: off
+            nogap_score = (
+                score_matrix[row - 1][col - 1]
+                + match_fn(sequenceA[row - 1], sequenceB[col - 1])
+            )
+
+            # fmt: on
+            # Try to find a better score by opening gaps in sequenceA.
+            # Do this by checking alignments from each column in the row.
+            # Each column represents a different character to align from,
+            # and thus a different length gap.
+            # Although the gap function does not distinguish between opening
+            # and extending a gap, we distinguish them for the backtrace.
+            if not penalize_end_gaps[0] and row == lenA:
+                row_open = score_matrix[row][col - 1]
+                row_extend = max(score_matrix[row][x] for x in range(col))
+            else:
+                row_open = score_matrix[row][col - 1] + gap_A_fn(row, 1)
+                row_extend = max(
+                    score_matrix[row][x] + gap_A_fn(row, col - x) for x in range(col)
+                )
+
+            # Try to find a better score by opening gaps in sequenceB.
+            if not penalize_end_gaps[1] and col == lenB:
+                col_open = score_matrix[row - 1][col]
+                col_extend = max(score_matrix[x][col] for x in range(row))
+            else:
+                col_open = score_matrix[row - 1][col] + gap_B_fn(col, 1)
+                col_extend = max(
+                    score_matrix[x][col] + gap_B_fn(col, row - x) for x in range(row)
+                )
+
+            best_score = max(nogap_score, row_open, row_extend, col_open, col_extend)
+            local_max_score = max(local_max_score, best_score)
+            if not align_globally and best_score < 0:
+                score_matrix[row][col] = 0.0
+            else:
+                score_matrix[row][col] = best_score
+
+            # The backtrace is encoded in binary. See _make_score_matrix_fast
+            # for details.
+            if not score_only:
+                trace_score = 0
+                if rint(nogap_score) == rint(best_score):
+                    trace_score += 2
+                if rint(row_open) == rint(best_score):
+                    trace_score += 1
+                if rint(row_extend) == rint(best_score):
+                    trace_score += 8
+                if rint(col_open) == rint(best_score):
+                    trace_score += 4
+                if rint(col_extend) == rint(best_score):
+                    trace_score += 16
+                trace_matrix[row][col] = trace_score
+
+    if not align_globally:
+        best_score = local_max_score
+
+    return score_matrix, trace_matrix, best_score
+
+
+def _make_score_matrix_fast(
+    sequenceA,
+    sequenceB,
+    match_fn,
+    open_A,
+    extend_A,
+    open_B,
+    extend_B,
+    penalize_extend_when_opening,
+    penalize_end_gaps,
+    align_globally,
+    score_only,
+):
+    """Generate a score and traceback matrix according to Gotoh (PRIVATE).
+
+    This is an implementation of the Needleman-Wunsch dynamic programming
+    algorithm as modified by Gotoh, implementing affine gap penalties.
+    In short, we have three matrices, holding scores for alignments ending
+    in (1) a match/mismatch, (2) a gap in sequence A, and (3) a gap in
+    sequence B, respectively. However, we can combine them in one matrix,
+    which holds the best scores, and store only those values from the
+    other matrices that are actually used for the next step of calculation.
+    The traceback matrix holds the positions for backtracing the alignment.
+    """
+    first_A_gap = calc_affine_penalty(1, open_A, extend_A, penalize_extend_when_opening)
+    first_B_gap = calc_affine_penalty(1, open_B, extend_B, penalize_extend_when_opening)
+    local_max_score = 0
+
+    # Create the score and traceback matrices. These should be in the
+    # shape:
+    # sequenceA (down) x sequenceB (across)
+    lenA, lenB = len(sequenceA), len(sequenceB)
+    score_matrix, trace_matrix = [], []
+    for i in range(lenA + 1):
+        score_matrix.append([None] * (lenB + 1))
+        if not score_only:
+            trace_matrix.append([None] * (lenB + 1))
+
+    # Initialize first row and column with gap scores. This is like opening up
+    # i gaps at the beginning of sequence A or B.
+    for i in range(lenA + 1):
+        if penalize_end_gaps[1]:  # [1]:gap in sequence B
+            score = calc_affine_penalty(
+                i, open_B, extend_B, penalize_extend_when_opening
+            )
+        else:
+            score = 0
+        score_matrix[i][0] = score
+    for i in range(lenB + 1):
+        if penalize_end_gaps[0]:  # [0]:gap in sequence A
+            score = calc_affine_penalty(
+                i, open_A, extend_A, penalize_extend_when_opening
+            )
+        else:
+            score = 0
+        score_matrix[0][i] = score
+
+    # Now initialize the col 'matrix'. Actually this is only a one dimensional
+    # list, since we only need the col scores from the last row.
+    col_score = [0]  # Best score, if actual alignment ends with gap in seqB
+    for i in range(1, lenB + 1):
+        col_score.append(
+            calc_affine_penalty(i, 2 * open_B, extend_B, penalize_extend_when_opening)
+        )
+
+    # The row 'matrix' is calculated on the fly. Here we only need the actual
+    # score.
+    # Now, filling up the score and traceback matrices:
+    for row in range(1, lenA + 1):
+        row_score = calc_affine_penalty(
+            row, 2 * open_A, extend_A, penalize_extend_when_opening
+        )
+        for col in range(1, lenB + 1):
+            # Calculate the score that would occur by extending the
+            # alignment without gaps.
+            # fmt: off
+            nogap_score = (
+                score_matrix[row - 1][col - 1]
+                + match_fn(sequenceA[row - 1], sequenceB[col - 1])
+            )
+            # fmt: on
+            # Check the score that would occur if there were a gap in
+            # sequence A. This could come from opening a new gap or
+            # extending an existing one.
+            # A gap in sequence A can also be opened if it follows a gap in
+            # sequence B:  A-
+            #              -B
+            if not penalize_end_gaps[0] and row == lenA:
+                row_open = score_matrix[row][col - 1]
+                row_extend = row_score
+            else:
+                row_open = score_matrix[row][col - 1] + first_A_gap
+                row_extend = row_score + extend_A
+            row_score = max(row_open, row_extend)
+
+            # The same for sequence B:
+            if not penalize_end_gaps[1] and col == lenB:
+                col_open = score_matrix[row - 1][col]
+                col_extend = col_score[col]
+            else:
+                col_open = score_matrix[row - 1][col] + first_B_gap
+                col_extend = col_score[col] + extend_B
+            col_score[col] = max(col_open, col_extend)
+
+            best_score = max(nogap_score, col_score[col], row_score)
+            local_max_score = max(local_max_score, best_score)
+            if not align_globally and best_score < 0:
+                score_matrix[row][col] = 0
+            else:
+                score_matrix[row][col] = best_score
+
+            # Now the trace_matrix. The edges of the backtrace are encoded
+            # binary: 1 = open gap in seqA, 2 = match/mismatch of seqA and
+            # seqB, 4 = open gap in seqB, 8 = extend gap in seqA, and
+            # 16 = extend gap in seqB. These values can be summed up.
+            # Thus, the trace score 7 means that the best score can either
+            # come from opening a gap in seqA (=1), pairing two characters
+            # of seqA and seqB (+2=3) or opening a gap in seqB (+4=7).
+            # However, if we only want the score we don't care about the trace.
+            if not score_only:
+                row_score_rint = rint(row_score)
+                col_score_rint = rint(col_score[col])
+                row_trace_score = 0
+                col_trace_score = 0
+                if rint(row_open) == row_score_rint:
+                    row_trace_score += 1  # Open gap in seqA
+                if rint(row_extend) == row_score_rint:
+                    row_trace_score += 8  # Extend gap in seqA
+                if rint(col_open) == col_score_rint:
+                    col_trace_score += 4  # Open gap in seqB
+                if rint(col_extend) == col_score_rint:
+                    col_trace_score += 16  # Extend gap in seqB
+
+                trace_score = 0
+                best_score_rint = rint(best_score)
+                if rint(nogap_score) == best_score_rint:
+                    trace_score += 2  # Align seqA with seqB
+                if row_score_rint == best_score_rint:
+                    trace_score += row_trace_score
+                if col_score_rint == best_score_rint:
+                    trace_score += col_trace_score
+                trace_matrix[row][col] = trace_score
+
+    if not align_globally:
+        best_score = local_max_score
+
+    return score_matrix, trace_matrix, best_score
+
+
+def _recover_alignments(
+    sequenceA,
+    sequenceB,
+    starts,
+    best_score,
+    score_matrix,
+    trace_matrix,
+    align_globally,
+    gap_char,
+    one_alignment_only,
+    gap_A_fn,
+    gap_B_fn,
+    reverse=False,
+):
+    """Do the backtracing and return a list of alignments (PRIVATE).
+
+    Recover the alignments by following the traceback matrix.  This
+    is a recursive procedure, but it's implemented here iteratively
+    with a stack.
+
+    sequenceA and sequenceB may be sequences, including strings,
+    lists, or list-like objects.  In order to preserve the type of
+    the object, we need to use slices on the sequences instead of
+    indexes.  For example, sequenceA[row] may return a type that's
+    not compatible with sequenceA, e.g. if sequenceA is a list and
+    sequenceA[row] is a string.  Thus, avoid using indexes and use
+    slices, e.g. sequenceA[row:row+1].  Assume that client-defined
+    sequence classes preserve these semantics.
+    """
+    lenA, lenB = len(sequenceA), len(sequenceB)
+    ali_seqA, ali_seqB = sequenceA[0:0], sequenceB[0:0]
+    tracebacks = []
+    in_process = []
+
+    for start in starts:
+        score, (row, col) = start
+        begin = 0
+        if align_globally:
+            end = None
+        else:
+            # If this start is a zero-extension: don't start here!
+            if (score, (row - 1, col - 1)) in starts:
+                continue
+            # Local alignments should start with a positive score!
+            if score <= 0:
+                continue
+            # Local alignments should not end with a gap!:
+            trace = trace_matrix[row][col]
+            if (trace - trace % 2) % 4 == 2:  # Trace contains 'nogap', fine!
+                trace_matrix[row][col] = 2
+            # If not, don't start here!
+            else:
+                continue
+            end = -max(lenA - row, lenB - col)
+            if not end:
+                end = None
+            col_distance = lenB - col
+            row_distance = lenA - row
+
+            # fmt: off
+            ali_seqA = (
+                (col_distance - row_distance) * gap_char
+                + sequenceA[lenA - 1 : row - 1 : -1]
+            )
+            ali_seqB = (
+                (row_distance - col_distance) * gap_char
+                + sequenceB[lenB - 1 : col - 1 : -1]
+            )
+            # fmt: on
+        in_process += [
+            (ali_seqA, ali_seqB, end, row, col, False, trace_matrix[row][col])
+        ]
+    while in_process and len(tracebacks) < MAX_ALIGNMENTS:
+        # Although we allow a gap in seqB to be followed by a gap in seqA,
+        # we don't want to allow it the other way round, since this would
+        # give redundant alignments of type: A-  vs.  -A
+        #                                    -B       B-
+        # Thus we need to keep track if a gap in seqA was opened (col_gap)
+        # and stop the backtrace (dead_end) if a gap in seqB follows.
+        #
+        # Attention: This may fail, if the gap-penalties for both strands are
+        # different. In this case the second alignment may be the only optimal
+        # alignment. Thus it can happen that no alignment is returned. For
+        # this case a workaround was implemented, which reverses the input and
+        # the matrices (this happens in _reverse_matrices) and repeats the
+        # backtrace. The variable 'reverse' keeps track of this.
+        dead_end = False
+        ali_seqA, ali_seqB, end, row, col, col_gap, trace = in_process.pop()
+
+        while (row > 0 or col > 0) and not dead_end:
+            cache = (ali_seqA[:], ali_seqB[:], end, row, col, col_gap)
+
+            # If trace is empty we have reached at least one border of the
+            # matrix or the end of a local alignment. Just add the rest of
+            # the sequence(s) and fill with gaps if necessary.
+            if not trace:
+                if col and col_gap:
+                    dead_end = True
+                else:
+                    ali_seqA, ali_seqB = _finish_backtrace(
+                        sequenceA, sequenceB, ali_seqA, ali_seqB, row, col, gap_char
+                    )
+                break
+            elif trace % 2 == 1:  # = row open = open gap in seqA
+                trace -= 1
+                if col_gap:
+                    dead_end = True
+                else:
+                    col -= 1
+                    ali_seqA += gap_char
+                    ali_seqB += sequenceB[col : col + 1]
+                    col_gap = False
+            elif trace % 4 == 2:  # = match/mismatch of seqA with seqB
+                trace -= 2
+                row -= 1
+                col -= 1
+                ali_seqA += sequenceA[row : row + 1]
+                ali_seqB += sequenceB[col : col + 1]
+                col_gap = False
+            elif trace % 8 == 4:  # = col open = open gap in seqB
+                trace -= 4
+                row -= 1
+                ali_seqA += sequenceA[row : row + 1]
+                ali_seqB += gap_char
+                col_gap = True
+            elif trace in (8, 24):  # = row extend = extend gap in seqA
+                trace -= 8
+                if col_gap:
+                    dead_end = True
+                else:
+                    col_gap = False
+                    # We need to find the starting point of the extended gap
+                    x = _find_gap_open(
+                        sequenceA,
+                        sequenceB,
+                        ali_seqA,
+                        ali_seqB,
+                        end,
+                        row,
+                        col,
+                        col_gap,
+                        gap_char,
+                        score_matrix,
+                        trace_matrix,
+                        in_process,
+                        gap_A_fn,
+                        col,
+                        row,
+                        "col",
+                        best_score,
+                        align_globally,
+                    )
+                    ali_seqA, ali_seqB, row, col, in_process, dead_end = x
+            elif trace == 16:  # = col extend = extend gap in seqB
+                trace -= 16
+                col_gap = True
+                x = _find_gap_open(
+                    sequenceA,
+                    sequenceB,
+                    ali_seqA,
+                    ali_seqB,
+                    end,
+                    row,
+                    col,
+                    col_gap,
+                    gap_char,
+                    score_matrix,
+                    trace_matrix,
+                    in_process,
+                    gap_B_fn,
+                    row,
+                    col,
+                    "row",
+                    best_score,
+                    align_globally,
+                )
+                ali_seqA, ali_seqB, row, col, in_process, dead_end = x
+
+            if trace:  # There is another path to follow...
+                cache += (trace,)
+                in_process.append(cache)
+            trace = trace_matrix[row][col]
+            if not align_globally:
+                if score_matrix[row][col] == best_score:
+                    # We have gone through a 'zero-score' extension, discard it
+                    dead_end = True
+                elif score_matrix[row][col] <= 0:
+                    # We have reached the end of the backtrace
+                    begin = max(row, col)
+                    trace = 0
+        if not dead_end:
+            if not reverse:
+                tracebacks.append((ali_seqA[::-1], ali_seqB[::-1], score, begin, end))
+            else:
+                tracebacks.append((ali_seqB[::-1], ali_seqA[::-1], score, begin, end))
+            if one_alignment_only:
+                break
+    return _clean_alignments(tracebacks)
+
+
+def _find_start(score_matrix, best_score, align_globally):
+    """Return a list of starting points (score, (row, col)) (PRIVATE).
+
+    Indicating every possible place to start the tracebacks.
+    """
+    nrows, ncols = len(score_matrix), len(score_matrix[0])
+    # In this implementation of the global algorithm, the start will always be
+    # the bottom right corner of the matrix.
+    if align_globally:
+        starts = [(best_score, (nrows - 1, ncols - 1))]
+    else:
+        # For local alignments, there may be many different start points.
+        starts = []
+        tolerance = 0  # XXX do anything with this?
+        # Now find all the positions within some tolerance of the best
+        # score.
+        for row in range(nrows):
+            for col in range(ncols):
+                score = score_matrix[row][col]
+                if rint(abs(score - best_score)) <= rint(tolerance):
+                    starts.append((score, (row, col)))
+    return starts
+
+
+def _reverse_matrices(score_matrix, trace_matrix):
+    """Reverse score and trace matrices (PRIVATE)."""
+    reverse_score_matrix = []
+    reverse_trace_matrix = []
+    # fmt: off
+    reverse_trace = {
+        1: 4, 2: 2, 3: 6, 4: 1, 5: 5, 6: 3, 7: 7, 8: 16, 9: 20, 10: 18, 11: 22, 12: 17,
+        13: 21, 14: 19, 15: 23, 16: 8, 17: 12, 18: 10, 19: 14, 20: 9, 21: 13, 22: 11,
+        23: 15, 24: 24, 25: 28, 26: 26, 27: 30, 28: 25, 29: 29, 30: 27, 31: 31,
+        None: None,
+    }
+    # fmt: on
+    for col in range(len(score_matrix[0])):
+        new_score_row = []
+        new_trace_row = []
+        for row in range(len(score_matrix)):
+            new_score_row.append(score_matrix[row][col])
+            new_trace_row.append(reverse_trace[trace_matrix[row][col]])
+        reverse_score_matrix.append(new_score_row)
+        reverse_trace_matrix.append(new_trace_row)
+    return reverse_score_matrix, reverse_trace_matrix
+
+
+def _clean_alignments(alignments):
+    """Take a list of alignments and return a cleaned version (PRIVATE).
+
+    Remove duplicates, make sure begin and end are set correctly, remove
+    empty alignments.
+    """
+    Alignment = namedtuple("Alignment", ("seqA, seqB, score, start, end"))
+    unique_alignments = []
+    for align in alignments:
+        if align not in unique_alignments:
+            unique_alignments.append(align)
+    i = 0
+    while i < len(unique_alignments):
+        seqA, seqB, score, begin, end = unique_alignments[i]
+        # Make sure end is set reasonably.
+        if end is None:  # global alignment
+            end = len(seqA)
+        elif end < 0:
+            end = end + len(seqA)
+        # If there's no alignment here, get rid of it.
+        if begin >= end:
+            del unique_alignments[i]
+            continue
+        unique_alignments[i] = Alignment(seqA, seqB, score, begin, end)
+        i += 1
+    return unique_alignments
+
+
+def _finish_backtrace(sequenceA, sequenceB, ali_seqA, ali_seqB, row, col, gap_char):
+    """Add remaining sequences and fill with gaps if necessary (PRIVATE)."""
+    if row:
+        ali_seqA += sequenceA[row - 1 :: -1]
+    if col:
+        ali_seqB += sequenceB[col - 1 :: -1]
+    if row > col:
+        ali_seqB += gap_char * (len(ali_seqA) - len(ali_seqB))
+    elif col > row:
+        ali_seqA += gap_char * (len(ali_seqB) - len(ali_seqA))
+    return ali_seqA, ali_seqB
+
+
+def _find_gap_open(
+    sequenceA,
+    sequenceB,
+    ali_seqA,
+    ali_seqB,
+    end,
+    row,
+    col,
+    col_gap,
+    gap_char,
+    score_matrix,
+    trace_matrix,
+    in_process,
+    gap_fn,
+    target,
+    index,
+    direction,
+    best_score,
+    align_globally,
+):
+    """Find the starting point(s) of the extended gap (PRIVATE)."""
+    dead_end = False
+    target_score = score_matrix[row][col]
+    for n in range(target):
+        if direction == "col":
+            col -= 1
+            ali_seqA += gap_char
+            ali_seqB += sequenceB[col : col + 1]
+        else:
+            row -= 1
+            ali_seqA += sequenceA[row : row + 1]
+            ali_seqB += gap_char
+        actual_score = score_matrix[row][col] + gap_fn(index, n + 1)
+        if not align_globally and score_matrix[row][col] == best_score:
+            # We have run through a 'zero-score' extension; discard it
+            dead_end = True
+            break
+        if rint(actual_score) == rint(target_score) and n > 0:
+            if not trace_matrix[row][col]:
+                break
+            else:
+                in_process.append(
+                    (
+                        ali_seqA[:],
+                        ali_seqB[:],
+                        end,
+                        row,
+                        col,
+                        col_gap,
+                        trace_matrix[row][col],
+                    )
+                )
+        if not trace_matrix[row][col]:
+            dead_end = True
+    return ali_seqA, ali_seqB, row, col, in_process, dead_end
+
+
+_PRECISION = 1000
+
+
+def rint(x, precision=_PRECISION):
+    """Print number with declared precision."""
+    return int(x * precision + 0.5)
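+
+# For example (an illustrative sketch): rint(3.0000004) == rint(3.0) == 3000,
+# so two floats that round to the same 1/_PRECISION step compare as equal
+# in the traceback.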
+
+
+class identity_match:
+    """Create a match function for use in an alignment.
+
+    match and mismatch are the scores to give when two residues are equal
+    or unequal.  By default, match is 1 and mismatch is 0.
+    """
+
+    def __init__(self, match=1, mismatch=0):
+        """Initialize the class."""
+        self.match = match
+        self.mismatch = mismatch
+
+    def __call__(self, charA, charB):
+        """Call a match function instance already created."""
+        if charA == charB:
+            return self.match
+        return self.mismatch
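+
+# Example usage (an illustrative sketch):
+#
+#     match_fn = identity_match(2, -1)
+#     match_fn("A", "A")  # -> 2
+#     match_fn("A", "C")  # -> -1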
+
+
+class dictionary_match:
+    """Create a match function for use in an alignment.
+
+    Attributes:
+     - score_dict     - A dictionary where the keys are tuples (residue 1,
+       residue 2) and the values are the match scores between those residues.
+     - symmetric      - A flag that indicates whether the scores are symmetric.
+
+    """
+
+    def __init__(self, score_dict, symmetric=1):
+        """Initialize the class."""
+        self.score_dict = score_dict
+        self.symmetric = symmetric
+
+    def __call__(self, charA, charB):
+        """Call a dictionary match instance already created."""
+        if self.symmetric and (charA, charB) not in self.score_dict:
+            # If the score dictionary is symmetric, then look up the
+            # score both ways.
+            charB, charA = charA, charB
+        return self.score_dict[(charA, charB)]
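+
+# Example usage (an illustrative sketch):
+#
+#     match_fn = dictionary_match({("A", "C"): 2.5}, symmetric=1)
+#     match_fn("A", "C")  # -> 2.5
+#     match_fn("C", "A")  # -> 2.5, looked up as ("A", "C")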
+
+
+class affine_penalty:
+    """Create a gap function for use in an alignment."""
+
+    def __init__(self, open, extend, penalize_extend_when_opening=0):
+        """Initialize the class."""
+        if open > 0 or extend > 0:
+            raise ValueError("Gap penalties should be non-positive.")
+        if not penalize_extend_when_opening and (extend < open):
+            raise ValueError(
+                "Gap opening penalty should be higher than or equal to "
+                "the gap extension penalty (in absolute value)"
+            )
+        self.open, self.extend = open, extend
+        self.penalize_extend_when_opening = penalize_extend_when_opening
+
+    def __call__(self, index, length):
+        """Call a gap function instance already created."""
+        return calc_affine_penalty(
+            length, self.open, self.extend, self.penalize_extend_when_opening
+        )
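+
+# Example usage (an illustrative sketch):
+#
+#     gap_fn = affine_penalty(-2, -0.5)
+#     gap_fn(0, 1)  # -> -2.0 (open a gap of length 1)
+#     gap_fn(0, 3)  # -> -3.0 (open once, extend twice)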
+
+
+def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
+    """Calculate a penality score for the gap function."""
+    if length <= 0:
+        return 0.0
+    penalty = open + extend * length
+    if not penalize_extend_when_opening:
+        penalty -= extend
+    return penalty
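+
+# For example (an illustrative sketch): calc_affine_penalty(3, -2, -0.5, 1)
+# is -3.5, while with penalize_extend_when_opening=0 the first extension is
+# not charged and the same gap costs -3.0.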
+
+
+def print_matrix(matrix):
+    """Print out a matrix for debugging purposes."""
+    # Transpose the matrix and get the length of the values in each column.
+    matrixT = [[] for x in range(len(matrix[0]))]
+    for i in range(len(matrix)):
+        for j in range(len(matrix[i])):
+            matrixT[j].append(len(str(matrix[i][j])))
+    ndigits = [max(x) for x in matrixT]
+    for i in range(len(matrix)):
+        # Use a string formatting trick to add leading spaces.
+        print(
+            " ".join("%*s " % (ndigits[j], matrix[i][j]) for j in range(len(matrix[i])))
+        )
+
+
+def format_alignment(align1, align2, score, begin, end, full_sequences=False):
+    """Format the alignment prettily into a string.
+
+    IMPORTANT: Gap symbol must be "-" (or ['-'] for lists)!
+
+    Since Biopython 1.71 identical matches are shown with a pipe
+    character, mismatches as a dot, and gaps as a space.
+
+    Prior releases just used the pipe character to indicate the
+    aligned region (matches, mismatches and gaps).
+
+    Also, in local alignments, if the alignment does not include
+    the whole sequences, now only the aligned part is shown,
+    together with the start positions of the aligned subsequences.
+    The start positions are 1-based; so start position n is the
+    n-th base/amino acid in the *un-aligned* sequence.
+
+    NOTE: This is different to the alignment's begin/end values,
+    which give the Python indices (0-based) of the bases/amino acids
+    in the *aligned* sequences.
+
+    If you want to restore the 'historic' behaviour, that means
+    displaying the whole sequences (including the non-aligned parts),
+    use ``full_sequences=True``. In this case, the non-aligned leading
+    and trailing parts are also indicated by spaces in the match-line.
+    """
+    align_begin = begin
+    align_end = end
+    start1 = start2 = ""
+    start_m = begin  # Begin of match line (how many spaces to include)
+    # For local alignments:
+    if not full_sequences and (begin != 0 or end != len(align1)):
+        # Calculate the actual start positions in the un-aligned sequences
+        # This will only work if the gap symbol is '-' or ['-']!
+        start1 = str(len(align1[:begin]) - align1[:begin].count("-") + 1) + " "
+        start2 = str(len(align2[:begin]) - align2[:begin].count("-") + 1) + " "
+        start_m = max(len(start1), len(start2))
+    elif full_sequences:
+        start_m = 0
+        begin = 0
+        end = len(align1)
+
+    if isinstance(align1, list):
+        # List elements will be separated by spaces, since they can be
+        # of different lengths
+        align1 = [a + " " for a in align1]
+        align2 = [a + " " for a in align2]
+
+    s1_line = ["{:>{width}}".format(start1, width=start_m)]  # seq1 line
+    m_line = [" " * start_m]  # match line
+    s2_line = ["{:>{width}}".format(start2, width=start_m)]  # seq2 line
+
+    for n, (a, b) in enumerate(zip(align1[begin:end], align2[begin:end])):
+        # Since list elements can be of different length, we center them,
+        # using the maximum length of the two compared elements as width
+        m_len = max(len(a), len(b))
+        s1_line.append("{:^{width}}".format(a, width=m_len))
+        s2_line.append("{:^{width}}".format(b, width=m_len))
+        if full_sequences and (n < align_begin or n >= align_end):
+            m_line.append("{:^{width}}".format(" ", width=m_len))  # space
+            continue
+        if a == b:
+            m_line.append("{:^{width}}".format("|", width=m_len))  # match
+        elif a.strip() == "-" or b.strip() == "-":
+            m_line.append("{:^{width}}".format(" ", width=m_len))  # gap
+        else:
+            m_line.append("{:^{width}}".format(".", width=m_len))  # mismatch
+
+    s2_line.append("\n  Score=%g\n" % score)
+    return "\n".join(["".join(s1_line), "".join(m_line), "".join(s2_line)])
+
+
+# Try to load C implementations of the functions. If that fails,
+# issue a warning and use the pure Python implementations.
+# The redefinition is deliberate, hence the no-quality-assurance (noqa)
+# flag for flake8.
+# First, we keep a reference to the pure Python functions (for testing purposes):
+
+_python_make_score_matrix_fast = _make_score_matrix_fast
+_python_rint = rint
+
+try:
+    from .cpairwise2 import rint, _make_score_matrix_fast  # noqa
+except ImportError:
+    warnings.warn(
+        "Import of C module failed. Falling back to pure Python "
+        "implementation. This may be slooow...",
+        BiopythonWarning,
+    )
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest()
diff --git a/code/lib/Bio/phenotype/__init__.py b/code/lib/Bio/phenotype/__init__.py
new file mode 100644
index 0000000..636b843
--- /dev/null
+++ b/code/lib/Bio/phenotype/__init__.py
@@ -0,0 +1,241 @@
+# Copyright 2014-2016 by Marco Galardini.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+r"""phenotype data input/output.
+
+Input
+=====
+The main function is Bio.phenotype.parse(...) which takes an input file,
+and format string.  This returns an iterator giving PlateRecord objects:
+
+    >>> from Bio import phenotype
+    >>> for record in phenotype.parse("phenotype/Plates.csv", "pm-csv"):
+    ...     print("%s %i" % (record.id, len(record)))
+    ...
+    PM01 96
+    PM09 96
+
+Note that the parse() function will invoke the relevant parser for the
+format with its default settings.  You may want more control, in which case
+you need to create a format-specific iterator directly.
+
+Input - Single Records
+======================
+If you expect your file to contain one-and-only-one record, then we provide
+the following 'helper' function which will return a single PlateRecord, or
+raise an exception if there are no records or more than one record:
+
+    >>> from Bio import phenotype
+    >>> record = phenotype.read("phenotype/Plate.json", "pm-json")
+    >>> print("%s %i" % (record.id, len(record)))
+    PM01 96
+
+This style is useful when you expect a single record only (and would
+consider multiple records an error).  For example, when dealing with PM
+JSON files saved by the opm library.
+
+However, if you just want the first record from a file containing multiple
+records, use the next() function on the iterator:
+
+    >>> from Bio import phenotype
+    >>> record = next(phenotype.parse("phenotype/Plates.csv", "pm-csv"))
+    >>> print("%s %i" % (record.id, len(record)))
+    PM01 96
+
+The above code will work as long as the file contains at least one record.
+Note that if there is more than one record, the remaining records will be
+silently ignored.
+
+Output
+======
+Use the function Bio.phenotype.write(...), which takes a complete set of
+PlateRecord objects (either as a list, or an iterator), an output file handle
+(or in recent versions of Biopython an output filename as a string) and of
+course the file format::
+
+        from Bio import phenotype
+        records = ...
+        phenotype.write(records, "example.json", "pm-json")
+
+Or, using a handle::
+
+        from Bio import phenotype
+        records = ...
+        with open("example.json", "w") as handle:
+            phenotype.write(records, handle, "pm-json")
+
+You are expected to call this function once (with all your records) and if
+using a handle, make sure you close it to flush the data to the hard disk.
+
+
+File Formats
+============
+When specifying the file format, use lowercase strings.
+
+ - pm-json - Phenotype Microarray plates in JSON format.
+ - pm-csv  - Phenotype Microarray plates in CSV format, which is the
+             machine vendor format
+
+Note that while Bio.phenotype can read the above file formats, it can only
+write in JSON format.
+"""
+
+from Bio import BiopythonExperimentalWarning
+from Bio.File import as_handle
+from . import phen_micro
+
+import warnings
+
+
+warnings.warn(
+    "Bio.phenotype is an experimental submodule which may undergo "
+    "significant changes prior to its future official release.",
+    BiopythonExperimentalWarning,
+)
+
+# Convention for format names is "mainname-format" in lower case.
+
+_FormatToIterator = {
+    "pm-csv": phen_micro.CsvIterator,
+    "pm-json": phen_micro.JsonIterator,
+}
+
+_FormatToWriter = {"pm-json": phen_micro.JsonWriter}
+
+
+def write(plates, handle, format):
+    """Write complete set of PlateRecords to a file.
+
+     - plates    - A list (or iterator) of PlateRecord objects.
+     - handle    - File handle object to write to, or filename as string
+                   (note older versions of Biopython only took a handle).
+     - format    - lower case string describing the file format to write.
+
+    You should close the handle after calling this function.
+
+    Returns the number of records written (as an integer).
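+
+    For example (a minimal sketch; "example.json" is a placeholder and
+    ``records`` is some iterable of PlateRecord objects)::
+
+        from Bio import phenotype
+        count = phenotype.write(records, "example.json", "pm-json")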
+    """
+    # Try and give helpful error messages:
+    if not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format:
+        raise ValueError("Format required (lower case string)")
+    if format != format.lower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+
+    if isinstance(plates, phen_micro.PlateRecord):
+        plates = [plates]
+
+    with as_handle(handle, "w") as fp:
+        # Map the file format to a writer class
+        if format in _FormatToWriter:
+            writer_class = _FormatToWriter[format]
+            count = writer_class(plates).write(fp)
+        else:
+            raise ValueError("Unknown format '%s'" % format)
+
+        if not isinstance(count, int):
+            raise TypeError(
+                "Internal error - the underlying %s "
+                "writer should have returned the record count, not %r" % (format, count)
+            )
+
+    return count
+
+
+def parse(handle, format):
+    """Turn a phenotype file into an iterator returning PlateRecords.
+
+     - handle   - handle to the file, or the filename as a string
+                  (note older versions of Biopython only took a handle).
+     - format   - lower case string describing the file format.
+
+    Typical usage, opening a file to read in, and looping over the record(s):
+
+    >>> from Bio import phenotype
+    >>> filename = "phenotype/Plates.csv"
+    >>> for record in phenotype.parse(filename, "pm-csv"):
+    ...    print("ID %s" % record.id)
+    ...    print("Number of wells %i" % len(record))
+    ...
+    ID PM01
+    Number of wells 96
+    ID PM09
+    Number of wells 96
+
+    Use the Bio.phenotype.read(...) function when you expect a single record
+    only.
+    """
+    # Try and give helpful error messages:
+    if not isinstance(format, str):
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format:
+        raise ValueError("Format required (lower case string)")
+    if format != format.lower():
+        raise ValueError("Format string '%s' should be lower case" % format)
+
+    with as_handle(handle) as fp:
+        # Map the file format to a sequence iterator:
+        if format in _FormatToIterator:
+            iterator_generator = _FormatToIterator[format]
+            i = iterator_generator(fp)
+        else:
+            raise ValueError("Unknown format '%s'" % format)
+        yield from i
+
+
+def read(handle, format):
+    """Turn a phenotype file into a single PlateRecord.
+
+     - handle   - handle to the file, or the filename as a string
+                  (note older versions of Biopython only took a handle).
+     - format   - string describing the file format.
+
+    This function is for use parsing phenotype files containing
+    exactly one record.  For example, reading a PM JSON file:
+
+    >>> from Bio import phenotype
+    >>> record = phenotype.read("phenotype/Plate.json", "pm-json")
+    >>> print("ID %s" % record.id)
+    ID PM01
+    >>> print("Number of wells %i" % len(record))
+    Number of wells 96
+
+    If the handle contains no records, or more than one record,
+    an exception is raised.  For example::
+
+        from Bio import phenotype
+        record = phenotype.read("plates.csv", "pm-csv")
+        Traceback (most recent call last):
+        ...
+        ValueError: More than one record found in handle
+
+    If, however, you want the first record from a file containing
+    multiple records, this function would raise an exception (as
+    shown in the example above).  Instead use:
+
+    >>> from Bio import phenotype
+    >>> record = next(phenotype.parse("phenotype/Plates.csv", "pm-csv"))
+    >>> print("First record's ID %s" % record.id)
+    First record's ID PM01
+
+    Use the Bio.phenotype.parse(handle, format) function if you want
+    to read multiple records from the handle.
+    """
+    iterator = parse(handle, format)
+    try:
+        first = next(iterator)
+    except StopIteration:
+        first = None
+    if first is None:
+        raise ValueError("No records found in handle")
+    try:
+        second = next(iterator)
+    except StopIteration:
+        second = None
+    if second is not None:
+        raise ValueError("More than one record found in handle")
+    return first
diff --git a/code/lib/Bio/phenotype/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/phenotype/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..e7c934c
Binary files /dev/null and b/code/lib/Bio/phenotype/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/phenotype/__pycache__/phen_micro.cpython-37.pyc b/code/lib/Bio/phenotype/__pycache__/phen_micro.cpython-37.pyc
new file mode 100644
index 0000000..56401ff
Binary files /dev/null and b/code/lib/Bio/phenotype/__pycache__/phen_micro.cpython-37.pyc differ
diff --git a/code/lib/Bio/phenotype/__pycache__/pm_fitting.cpython-37.pyc b/code/lib/Bio/phenotype/__pycache__/pm_fitting.cpython-37.pyc
new file mode 100644
index 0000000..cca8c12
Binary files /dev/null and b/code/lib/Bio/phenotype/__pycache__/pm_fitting.cpython-37.pyc differ
diff --git a/code/lib/Bio/phenotype/phen_micro.py b/code/lib/Bio/phenotype/phen_micro.py
new file mode 100644
index 0000000..93af24f
--- /dev/null
+++ b/code/lib/Bio/phenotype/phen_micro.py
@@ -0,0 +1,1207 @@
+# Copyright 2014-2016 by Marco Galardini.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Classes to work with Phenotype Microarray data.
+
+More information on the single plates can be found here: http://www.biolog.com/
+
+Classes:
+ - PlateRecord - Object that contain time course data on each well of the
+   plate, as well as metadata (if any).
+ - WellRecord - Object that contains the time course data of a single well
+ - JsonWriter - Writer of PlateRecord objects in JSON format.
+
+Functions:
+ - JsonIterator -  Incremental PM JSON parser, this is an iterator that returns
+   PlateRecord objects.
+ - CsvIterator - Incremental PM CSV parser, this is an iterator that returns
+   PlateRecord objects.
+ - _toOPM - Used internally by JsonWriter, converts PlateRecord objects in
+   dictionaries ready to be serialized in JSON format.
+
+"""
+
+import warnings
+import json
+import csv
+import numpy as np
+
+from Bio import BiopythonParserWarning
+
+# Private csv headers - hardcoded because these supposedly never change
+_datafile = "Data File"
+_plate = "Plate Type"
+_strainType = "Strain Type"
+_sample = "Sample Number"
+_strainName = "Strain Name"
+_strainNumber = "Strain Number"
+_other = "Other"
+_hour = "Hour"
+_file = "File"
+_position = "Position"
+_setupTime = "Setup Time"
+
+_platesPrefix = "PM"
+_platesPrefixMammalian = "PM-M"
+#
+
+# Json identifiers - hardcoded as they are set by the creators of opm
+_csvData = "csv_data"
+_measurements = "measurements"
+#
+
+
+class PlateRecord:
+    """PlateRecord object for storing Phenotype Microarray plates data.
+
+    A PlateRecord stores all the wells of a particular phenotype
+    Microarray plate, along with metadata (if any). The single wells can be
+    accessed calling their id as an index or iterating on the PlateRecord:
+
+    >>> from Bio import phenotype
+    >>> plate = phenotype.read("phenotype/Plate.json", "pm-json")
+    >>> well = plate['A05']
+    >>> for well in plate:
+    ...    print(well.id)
+    ...
+    A01
+    ...
+
+    The plate rows and columns can be queried with an indexing system similar
+    to NumPy and other matrices:
+
+    >>> print(plate[1])
+    Plate ID: PM01
+    Well: 12
+    Rows: 1
+    Columns: 12
+    PlateRecord('WellRecord['B01'], WellRecord['B02'], WellRecord['B03'], ..., WellRecord['B12']')
+
+    >>> print(plate[:,1])
+    Plate ID: PM01
+    Well: 8
+    Rows: 8
+    Columns: 1
+    PlateRecord('WellRecord['A02'], WellRecord['B02'], WellRecord['C02'], ..., WellRecord['H02']')
+
+    Single WellRecord objects can be accessed using this indexing system:
+
+    >>> print(plate[1,2])
+    Plate ID: PM01
+    Well ID: B03
+    Time points: 384
+    Minimum signal 0.00 at time 11.00
+    Maximum signal 76.25 at time 18.00
+    WellRecord('(0.0, 11.0), (0.25, 11.0), (0.5, 11.0), (0.75, 11.0), (1.0, 11.0), ..., (95.75, 11.0)')
+
+    The presence of a particular well can be inspected with the "in" keyword:
+
+    >>> 'A01' in plate
+    True
+
+    All the wells belonging to a "row" (identified by the first character of
+    the well id) in the plate can be obtained:
+
+    >>> for well in plate.get_row('H'):
+    ...     print(well.id)
+    ...
+    H01
+    H02
+    H03
+    ...
+
+    All the wells belonging to a "column" (identified by the number of the well)
+    in the plate can be obtained:
+
+    >>> for well in plate.get_column(12):
+    ...     print(well.id)
+    ...
+    A12
+    B12
+    C12
+    ...
+
+    Two PlateRecord objects can be compared: if all their wells are equal the
+    two plates are considered equal:
+
+    >>> plate2 = phenotype.read("phenotype/Plate.json", "pm-json")
+    >>> plate == plate2
+    True
+
+    Two PlateRecord objects can be summed up or subtracted from each other:
+    the signals of each well will be summed up or subtracted. The id of the
+    left operand will be kept:
+
+    >>> plate3 = plate + plate2
+    >>> print(plate3.id)
+    PM01
+
+    Many Phenotype Microarray plates have a "negative control" well, which can
+    be subtracted from all wells:
+
+    >>> subplate = plate.subtract_control()
+
+    """
+
+    def __init__(self, plateid, wells=None):
+        """Initialize the class."""
+        self.id = plateid
+
+        if wells is None:
+            wells = []
+
+        # Similar behaviour as GenBank
+        # Contains all the attributes
+        self.qualifiers = {}
+
+        # Well_id --> WellRecord objects
+        self._wells = {}
+        try:
+            for w in wells:
+                self._is_well(w)
+                self[w.id] = w
+        except TypeError:
+            raise TypeError(
+                "You must provide an iterator-like object containing the single wells"
+            )
+
+        self._update()
+
+    def _update(self):
+        """Update the rows and columns string identifiers (PRIVATE)."""
+        self._rows = sorted({x[0] for x in self._wells})
+        self._columns = sorted({x[1:] for x in self._wells})
+
+    def _is_well(self, obj):
+        """Check if the given object is a WellRecord object (PRIVATE).
+
+        Used both for the class constructor and the __setitem__ method
+        """
+        # Value should be of WellRecord type
+        if not isinstance(obj, WellRecord):
+            raise ValueError(
+                "A WellRecord type object is needed as value (got %s)" % type(obj)
+            )
+
+    def __getitem__(self, index):
+        """Access part of the plate.
+
+        Depending on the indices, you can get a WellRecord object
+        (representing a single well of the plate),
+        or another plate
+        (representing some part or all of the original plate).
+
+        plate[wid] gives a WellRecord (if wid is a WellRecord id)
+        plate[r,c] gives a WellRecord
+        plate[r] gives a row as a PlateRecord
+        plate[r,:] gives a row as a PlateRecord
+        plate[:,c] gives a column as a PlateRecord
+
+        plate[:] and plate[:,:] give a copy of the plate
+
+        Anything else gives a subset of the original plate, e.g.
+        plate[0:2] or plate[0:2,:] uses only row 0 and 1
+        plate[:,1:3] uses only columns 1 and 2
+        plate[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2
+
+        >>> from Bio import phenotype
+        >>> plate = phenotype.read("phenotype/Plate.json", "pm-json")
+
+        You can access a well of the plate, using its id.
+
+        >>> w = plate['A01']
+
+        You can access a row of the plate as a PlateRecord using an integer
+        index:
+
+        >>> first_row = plate[0]
+        >>> print(first_row)
+        Plate ID: PM01
+        Well: 12
+        Rows: 1
+        Columns: 12
+        PlateRecord('WellRecord['A01'], WellRecord['A02'], WellRecord['A03'], ..., WellRecord['A12']')
+        >>> last_row = plate[-1]
+        >>> print(last_row)
+        Plate ID: PM01
+        Well: 12
+        Rows: 1
+        Columns: 12
+        PlateRecord('WellRecord['H01'], WellRecord['H02'], WellRecord['H03'], ..., WellRecord['H12']')
+
+        You can also use Python's slice notation to get sub-plates
+        containing only some of the plate rows:
+
+        >>> sub_plate = plate[2:5]
+        >>> print(sub_plate)
+        Plate ID: PM01
+        Well: 36
+        Rows: 3
+        Columns: 12
+        PlateRecord('WellRecord['C01'], WellRecord['C02'], WellRecord['C03'], ..., WellRecord['E12']')
+
+        This includes support for a step, i.e. plate[start:end:step], which
+        can be used to select every second row:
+
+        >>> sub_plate = plate[::2]
+
+        You can also use two indices to specify both rows and columns.
+        Using simple integers gives you the single wells. e.g.
+
+        >>> w = plate[3, 4]
+        >>> print(w.id)
+        D05
+
+        To get a single column use this syntax:
+
+        >>> sub_plate = plate[:, 4]
+        >>> print(sub_plate)
+        Plate ID: PM01
+        Well: 8
+        Rows: 8
+        Columns: 1
+        PlateRecord('WellRecord['A05'], WellRecord['B05'], WellRecord['C05'], ..., WellRecord['H05']')
+
+        Or, to get part of a column,
+
+        >>> sub_plate = plate[1:3, 4]
+        >>> print(sub_plate)
+        Plate ID: PM01
+        Well: 2
+        Rows: 2
+        Columns: 1
+        PlateRecord(WellRecord['B05'], WellRecord['C05'])
+
+        However, in general you get a sub-plate,
+
+        >>> print(plate[1:5, 3:6])
+        Plate ID: PM01
+        Well: 12
+        Rows: 4
+        Columns: 3
+        PlateRecord('WellRecord['B04'], WellRecord['B05'], WellRecord['B06'], ..., WellRecord['E06']')
+
+        This should all seem familiar to anyone who has used the NumPy
+        array or matrix objects.
+        """
+        # Well identifier access
+        if isinstance(index, str):
+            try:
+                return self._wells[index]
+            except KeyError:
+                raise KeyError("Well %s not found!" % index)
+
+        # Integer index
+        elif isinstance(index, int):
+            try:
+                row = self._rows[index]
+            except IndexError:
+                raise IndexError("Row %d not found!" % index)
+            return PlateRecord(
+                self.id, filter(lambda x: x.id.startswith(row), self._wells.values())
+            )
+
+        # Slice
+        elif isinstance(index, slice):
+            rows = self._rows[index]
+            return PlateRecord(
+                self.id, filter(lambda x: x.id[0] in rows, self._wells.values())
+            )
+
+        # Other access
+        elif len(index) != 2:
+            raise TypeError("Invalid index type.")
+
+        row_index, col_index = index
+        if isinstance(row_index, int) and isinstance(col_index, int):
+            # Return a single WellRecord
+            try:
+                row = self._rows[row_index]
+            except IndexError:
+                raise IndexError("Row %d not found!" % row_index)
+            try:
+                col = self._columns[col_index]
+            except IndexError:
+                raise IndexError("Column %d not found!" % col_index)
+
+            return self._wells[row + col]
+
+        elif isinstance(row_index, int):
+            try:
+                row = self._rows[row_index]
+            except IndexError:
+                raise IndexError("Row %d not found!" % row_index)
+            cols = self._columns[col_index]
+
+            return PlateRecord(
+                self.id,
+                filter(
+                    lambda x: x.id.startswith(row) and x.id[1:] in cols,
+                    self._wells.values(),
+                ),
+            )
+
+        elif isinstance(col_index, int):
+            try:
+                col = self._columns[col_index]
+            except IndexError:
+                raise IndexError("Columns %d not found!" % col_index)
+            rows = self._rows[row_index]
+
+            return PlateRecord(
+                self.id,
+                filter(
+                    lambda x: x.id.endswith(col) and x.id[0] in rows,
+                    self._wells.values(),
+                ),
+            )
+
+        else:
+            rows = self._rows[row_index]
+            cols = self._columns[col_index]
+
+            return PlateRecord(
+                self.id,
+                filter(
+                    lambda x: x.id[0] in rows and x.id[1:] in cols, self._wells.values()
+                ),
+            )
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, str):
+            raise ValueError("Well identifier should be string-like")
+        self._is_well(value)
+        # Provided key and well ID should be the same
+        if value.id != key:
+            raise ValueError(
+                "WellRecord ID and provided key are different (got '%s' and '%s')"
+                % (value.id, key)
+            )
+        self._wells[key] = value
+
+        self._update()
+
+    def __delitem__(self, key):
+        if not isinstance(key, str):
+            raise ValueError("Well identifier should be string-like")
+        del self._wells[key]
+
+        self._update()
+
+    def __iter__(self):
+        for well in sorted(self._wells):
+            yield self._wells[well]
+
+    def __contains__(self, wellid):
+        if wellid in self._wells:
+            return True
+        return False
+
+    def __len__(self):
+        """Return the number of wells in this plate."""
+        return len(self._wells)
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return self._wells == other._wells
+        else:
+            return False
+
+    def __add__(self, plate):
+        """Add another PlateRecord object.
+
+        The wells in both plates must be the same
+
+        A new PlateRecord object is returned, having the same id as the
+        left operand.
+        """
+        if not isinstance(plate, PlateRecord):
+            raise TypeError("Expecting a PlateRecord object")
+
+        if {x.id for x in self} != {x.id for x in plate}:
+            raise ValueError("The two plates have different wells")
+
+        wells = []
+
+        for w in self:
+            wells.append(w + plate[w.id])
+
+        newp = PlateRecord(self.id, wells=wells)
+
+        return newp
+
+    def __sub__(self, plate):
+        """Subtract another PlateRecord object.
+
+        The wells in both plates must be the same
+
+        A new PlateRecord object is returned, having the same id as the
+        left operand.
+        """
+        if not isinstance(plate, PlateRecord):
+            raise TypeError("Expecting a PlateRecord object")
+
+        if {x.id for x in self} != {x.id for x in plate}:
+            raise ValueError("The two plates have different wells")
+
+        wells = []
+
+        for w in self:
+            wells.append(w - plate[w.id])
+
+        newp = PlateRecord(self.id, wells=wells)
+
+        return newp
+
+    def get_row(self, row):
+        """Get all the wells of a given row.
+
+        A row is identified with a letter (e.g. 'A')
+        """
+        # Key is cast to str implicitly
+        try:
+            row = str(row)
+        except Exception:
+            # Is it even possible to get an exception here?
+            raise ValueError("Row identifier should be string-like")
+        if len(row) > 1:
+            raise ValueError("Row identifier must be of maximum one letter")
+
+        for w in sorted(filter(lambda x: x.startswith(row), self._wells)):
+            yield self._wells[w]
+
+    def get_column(self, column):
+        """Get all the wells of a given column.
+
+        A column is identified with a number (e.g. '6')
+        """
+        # Column is cast to int implicitly
+        try:
+            column = int(column)
+        except Exception:
+            raise ValueError("Column identifier should be a number")
+
+        # A 96-well plate has well numbers in two digits
+        for w in sorted(filter(lambda x: x.endswith("%02d" % column), self._wells)):
+            yield self._wells[w]
+
+    def subtract_control(self, control="A01", wells=None):
+        """Subtract a 'control' well from the other plates wells.
+
+        By default the control is subtracted to all wells, unless
+        a list of well ID is provided
+
+        The control well should belong to the plate
+        A new PlateRecord object is returned
+        """
+        if control not in self:
+            raise ValueError("Control well not present in plate")
+        wcontrol = self[control]
+
+        if wells is None:
+            wells = self._wells.keys()
+
+        missing = {w for w in wells if w not in self}
+        if missing:
+            raise ValueError("Some wells to be subtracted are not present")
+
+        nwells = []
+
+        for w in self:
+            if w.id in wells:
+                nwells.append(w - wcontrol)
+            else:
+                nwells.append(w)
+
+        newp = PlateRecord(self.id, wells=nwells)
+
+        return newp
+
+    def __repr__(self):
+        """Return a (truncated) representation of the plate for debugging."""
+        if len(self._wells) > 4:
+            # Show the last well and the first three
+            return "%s('%s, ..., %s')" % (
+                self.__class__.__name__,
+                ", ".join(
+                    [
+                        "%s['%s']" % (self[x].__class__.__name__, self[x].id)
+                        for x in sorted(self._wells.keys())[:3]
+                    ]
+                ),
+                "%s['%s']"
+                % (
+                    self[sorted(self._wells.keys())[-1]].__class__.__name__,
+                    self[sorted(self._wells.keys())[-1]].id,
+                ),
+            )
+        else:
+            return "%s(%s)" % (
+                self.__class__.__name__,
+                ", ".join(
+                    [
+                        "%s['%s']" % (self[x].__class__.__name__, self[x].id)
+                        for x in sorted(self._wells.keys())
+                    ]
+                ),
+            )
+
+    def __str__(self):
+        """Return a human readable summary of the record (string).
+
+        The Python built-in function str works by calling the object's __str__
+        method, e.g.
+
+        >>> from Bio import phenotype
+        >>> record = next(phenotype.parse("phenotype/Plates.csv", "pm-csv"))
+        >>> print(record)
+        Plate ID: PM01
+        Well: 96
+        Rows: 8
+        Columns: 12
+        PlateRecord('WellRecord['A01'], WellRecord['A02'], WellRecord['A03'], ..., WellRecord['H12']')
+
+        Note that long well lists are shown truncated.
+        """
+        lines = []
+        if self.id:
+            lines.append("Plate ID: %s" % self.id)
+        lines.append("Well: %i" % len(self))
+        # Here we assume that all well IDs start with a char
+        lines.append("Rows: %d" % len({x.id[0] for x in self}))
+        # Here we assume that the well number is a two-digit number
+        lines.append("Columns: %d" % len({x.id[1:3] for x in self}))
+        lines.append(repr(self))
+        return "\n".join(lines)
+
+
+class WellRecord:
+    """WellRecord stores all time course signals of a phenotype Microarray well.
+
+    The single time points and signals can be accessed iterating on the
+    WellRecord or using lists indexes or slices:
+
+    >>> from Bio import phenotype
+    >>> plate = phenotype.read("phenotype/Plate.json", "pm-json")
+    >>> well = plate['A05']
+    >>> for time, signal in well:
+    ...    print("Time: %f, Signal: %f" % (time, signal)) # doctest:+ELLIPSIS
+    ...
+    Time: 0.000000, Signal: 14.000000
+    Time: 0.250000, Signal: 13.000000
+    Time: 0.500000, Signal: 15.000000
+    Time: 0.750000, Signal: 15.000000
+    ...
+    >>> well[1]
+    16.0
+    >>> well[1:5]
+    [16.0, 20.0, 18.0, 15.0]
+    >>> well[1:5:0.5]
+    [16.0, 19.0, 20.0, 18.0, 18.0, 18.0, 15.0, 18.0]
+
+    If a time point is not present in the input file but lies between the
+    minimum and maximum time points, the interpolated signal is returned;
+    otherwise a NaN value is returned:
+
+    >>> well[1.3]
+    19.0
+    >>> well[1250]
+    nan
+
+    Two WellRecord objects can be compared: if their input time/signal pairs
+    are exactly the same, the two records are considered equal:
+
+    >>> well2 = plate['H12']
+    >>> well == well2
+    False
+
+    Two WellRecord objects can be summed up or subtracted from each other: a new
+    WellRecord object is returned, having the left operand's id.
+
+    >>> well1 = plate['A05']
+    >>> well2 = well + well1
+    >>> print(well2.id)
+    A05
+
+    If SciPy is installed, a sigmoid function can be fitted to the PM curve,
+    in order to extract some parameters; three sigmoid functions are available:
+    * gompertz
+    * logistic
+    * richards
+    The functions are described in Zwietering et al., 1990 (PMID: 16348228)
+
+    For example::
+
+        well.fit()
+        print(well.slope, well.model)
+        (61.853516785566917, 'logistic')
+
+    If no sigmoid function is specified, the first one that is successfully
+    fitted is used. The user can also request a specific function.
+
+    To specify gompertz::
+
+        well.fit('gompertz')
+        print(well.slope, well.model)
+        (127.94630059171354, 'gompertz')
+
+    If no function can be fitted, the parameters are left as None, except for
+    the max, min, average_height and area.
+    """
+
+    def __init__(self, wellid, plate=None, signals=None):
+        """Initialize the class."""
+        if plate is None:
+            self.plate = PlateRecord(None)
+        else:
+            self.plate = plate
+
+        self.id = wellid
+
+        # Curve parameters (to be calculated with the "fit" function)
+        # Parameters that don't need scipy
+        self.max = None
+        self.min = None
+        self.average_height = None
+
+        # Parameters that need scipy
+        self.area = None
+        self.plateau = None
+        self.slope = None
+        self.lag = None
+        self.v = None
+        self.y0 = None
+        self.model = None
+
+        # Original signals (private)
+        if signals is None:
+            self._signals = {}
+        else:
+            self._signals = signals
+
+    def _interpolate(self, time):
+        """Linear interpolation of the signals at certain time points (PRIVATE)."""
+        times = sorted(self._signals.keys())
+
+        return np.interp(
+            time, times, [self._signals[x] for x in times], left=np.nan, right=np.nan
+        )
+
+    def __setitem__(self, time, signal):
+        """Assign a signal at a certain time point."""
+        try:
+            time = float(time)
+        except ValueError:
+            raise ValueError("Time point should be a number")
+        try:
+            signal = float(signal)
+        except ValueError:
+            raise ValueError("Signal should be a number")
+
+        self._signals[time] = signal
+
+    def __getitem__(self, time):
+        """Return a subset of signals or a single signal."""
+        if isinstance(time, slice):
+            # Fix the missing values in the slice
+            if time.start is None:
+                start = 0
+            else:
+                start = time.start
+
+            if time.stop is None:
+                stop = max(self.get_times())
+            else:
+                stop = time.stop
+
+            time = np.arange(start, stop, time.step)
+            return list(self._interpolate(time))
+
+        elif isinstance(time, int) or isinstance(time, float):
+            return self._interpolate(time)
+
+        raise ValueError("Invalid index")
+
+    def __iter__(self):
+        for time in sorted(self._signals.keys()):
+            yield time, self._signals[time]
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            if list(self._signals.keys()) != list(other._signals.keys()):
+                return False
+            # Account for the presence of NaNs
+            for k in self._signals:
+                if np.isnan(self[k]) and np.isnan(other[k]):
+                    continue
+                elif self[k] != other[k]:
+                    return False
+            return True
+        else:
+            return False
+
+    def __add__(self, well):
+        """Add another WellRecord object.
+
+        A new WellRecord object is returned, having the same id as the
+        left operand
+        """
+        if not isinstance(well, WellRecord):
+            raise TypeError("Expecting a WellRecord object")
+
+        signals = {}
+
+        times = set(self._signals.keys()).union(set(well._signals.keys()))
+        for t in sorted(times):
+            signals[t] = self[t] + well[t]
+
+        neww = WellRecord(self.id, signals=signals)
+
+        return neww
+
+    def __sub__(self, well):
+        """Subtract another WellRecord object.
+
+        A new WellRecord object is returned, having the same id as the
+        left operand
+        """
+        if not isinstance(well, WellRecord):
+            raise TypeError("Expecting a WellRecord object")
+
+        signals = {}
+
+        times = set(self._signals.keys()).union(set(well._signals.keys()))
+        for t in sorted(times):
+            signals[t] = self[t] - well[t]
+
+        neww = WellRecord(self.id, signals=signals)
+
+        return neww
+
+    def __len__(self):
+        """Return the number of time points sampled."""
+        return len(self._signals)
+
+    def __repr__(self):
+        """Return a (truncated) representation of the signals for debugging."""
+        if len(self) > 7:
+            # Shows the last time point and the first five
+            return "%s('%s, ..., %s')" % (
+                self.__class__.__name__,
+                ", ".join([str(x) for x in self.get_raw()[:5]]),
+                str(self.get_raw()[-1]),
+            )
+        else:
+            return "%s(%s)" % (
+                self.__class__.__name__,
+                ", ".join([str(x) for x in self.get_raw()]),
+            )
+
+    def __str__(self):
+        """Return a human readable summary of the record (string).
+
+        The Python built-in function str works by calling the object's __str__
+        method, e.g.
+
+        >>> from Bio import phenotype
+        >>> plate = phenotype.read("phenotype/Plate.json", "pm-json")
+        >>> record = plate['A05']
+        >>> print(record)
+        Plate ID: PM01
+        Well ID: A05
+        Time points: 384
+        Minimum signal 0.25 at time 13.00
+        Maximum signal 19.50 at time 23.00
+        WellRecord('(0.0, 14.0), (0.25, 13.0), (0.5, 15.0), (0.75, 15.0), (1.0, 16.0), ..., (95.75, 16.0)')
+
+        Note that long time spans are shown truncated.
+        """
+        lines = []
+        if self.plate and self.plate.id:
+            lines.append("Plate ID: %s" % self.plate.id)
+        if self.id:
+            lines.append("Well ID: %s" % self.id)
+        lines.append("Time points: %i" % len(self))
+        lines.append("Minum signal %.2f at time %.2f" % min(self, key=lambda x: x[1]))
+        lines.append("Maximum signal %.2f at time %.2f" % max(self, key=lambda x: x[1]))
+        lines.append(repr(self))
+        return "\n".join(lines)
+
+    def get_raw(self):
+        """Get a list of time/signal pairs."""
+        return [(t, self._signals[t]) for t in sorted(self._signals.keys())]
+
+    def get_times(self):
+        """Get a list of the recorded time points."""
+        return sorted(self._signals.keys())
+
+    def get_signals(self):
+        """Get a list of the recorded signals (ordered by collection time)."""
+        return [self._signals[t] for t in sorted(self._signals.keys())]
+
+    def fit(self, function=("gompertz", "logistic", "richards")):
+        """Fit a sigmoid function to this well and extract curve parameters.
+
+        If function is None or an empty tuple/list, then no fitting is done.
+        Only the object's ``.min``, ``.max`` and ``.average_height`` are
+        calculated.
+
+        By default the following fitting functions will be used in order:
+         - gompertz
+         - logistic
+         - richards
+
+        The first function that is successfully fitted to the signals will
+        be used to extract the curve parameters and update ``.area`` and
+        ``.model``. If no function can be fitted an exception is raised.
+
+        The function argument should be a tuple or list of any of these three
+        function names as strings.
+
+        There is no return value.
+        """
+        avail_func = ("gompertz", "logistic", "richards")
+
+        # Parameters not dependent on curve fitting
+        self.max = max(self, key=lambda x: x[1])[1]
+        self.min = min(self, key=lambda x: x[1])[1]
+
+        self.average_height = np.array(self.get_signals()).mean()
+
+        if not function:
+            self.area = None
+            self.model = None
+            return
+        for sigmoid_func in function:
+            if sigmoid_func not in avail_func:
+                raise ValueError("Fitting function %r not supported" % sigmoid_func)
+
+        # Parameters that depend on scipy curve_fit
+        from .pm_fitting import fit, get_area
+        from .pm_fitting import logistic, gompertz, richards
+
+        function_map = {
+            "logistic": logistic,
+            "gompertz": gompertz,
+            "richards": richards,
+        }
+
+        self.area = get_area(self.get_signals(), self.get_times())
+
+        self.model = None
+        for sigmoid_func in function:
+            func = function_map[sigmoid_func]
+            try:
+                (self.plateau, self.slope, self.lag, self.v, self.y0), pcov = fit(
+                    func, self.get_times(), self.get_signals()
+                )
+
+                self.model = sigmoid_func
+                return
+            except RuntimeError:
+                continue
+        raise RuntimeError("Could not fit any sigmoid function")
+
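+# A minimal usage sketch (hedged; assumes SciPy is installed and that a
+# "phenotype/Plate.json" file like the one used in the doctests is available):
+#
+#     from Bio import phenotype
+#     plate = phenotype.read("phenotype/Plate.json", "pm-json")
+#     well = plate["A05"]
+#     well.fit(("logistic",))        # try only the logistic model
+#     print(well.model, well.slope)  # e.g. 'logistic' and the fitted slope
+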
+
+def JsonIterator(handle):
+    """Iterate over PM json records as PlateRecord objects.
+
+    Arguments:
+     - handle - input file
+
+    """
+    try:
+        data = json.load(handle)
+    except ValueError:
+        raise ValueError("Could not parse JSON file")
+
+    # We can have one single plate or several
+    # we need to discriminate
+    if hasattr(data, "keys"):
+        data = [data]
+
+    for pobj in data:
+        try:
+            plateID = pobj[_csvData][_plate]
+        except TypeError:
+            raise TypeError("Malformed JSON input")
+        except KeyError:
+            raise KeyError("Could not retrieve plate id")
+
+        # Parse also non-standard plate IDs
+        if not plateID.startswith(_platesPrefix) and not plateID.startswith(
+            _platesPrefixMammalian
+        ):
+            warnings.warn(
+                "Non-standard plate ID found (%s)" % plateID, BiopythonParserWarning
+            )
+        else:
+            # Simplify the plate IDs, removing letters, as opm does
+            if plateID.startswith(_platesPrefixMammalian):
+                pID = plateID[len(_platesPrefixMammalian) :]
+            else:
+                pID = plateID[len(_platesPrefix) :]
+            while len(pID) > 0:
+                try:
+                    int(pID)
+                    break
+                except ValueError:
+                    pID = pID[:-1]
+
+            # No luck
+            if len(pID) == 0:
+                warnings.warn(
+                    "Non-standard plate ID found (%s)" % plateID, BiopythonParserWarning
+                )
+            elif int(pID) < 0:
+                warnings.warn(
+                    "Non-standard plate ID found (%s), using %s"
+                    % (plateID, _platesPrefix + "%02d" % abs(int(pID))),
+                    BiopythonParserWarning,
+                )
+                plateID = _platesPrefix + "%02d" % abs(int(pID))
+            else:
+                if plateID.startswith(_platesPrefixMammalian):
+                    plateID = _platesPrefixMammalian + "%02d" % int(pID)
+                else:
+                    plateID = _platesPrefix + "%02d" % int(pID)
+
+        try:
+            times = pobj[_measurements][_hour]
+        except KeyError:
+            raise KeyError("Could not retrieve the time points")
+
+        plate = PlateRecord(plateID)
+
+        for k in pobj[_measurements]:
+            # Skip the time points
+            if k == _hour:
+                continue
+
+            plate[k] = WellRecord(
+                k,
+                plate=plate,
+                signals={
+                    times[i]: pobj[_measurements][k][i] for i in range(len(times))
+                },
+            )
+
+        # Remove the measurements and assign the other qualifiers
+        del pobj["measurements"]
+        plate.qualifiers = pobj
+
+        yield plate
+
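+# Example (hedged sketch): iterate over all plates in a PM JSON export:
+#
+#     with open("Plate.json") as handle:
+#         for plate in JsonIterator(handle):
+#             print(plate.id, len(plate))
+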
+
+def CsvIterator(handle):
+    """Iterate over PM csv records as PlateRecord objects.
+
+    Arguments:
+     - handle - input file
+
+    """
+    plate = None
+    data = False
+    qualifiers = {}
+    idx = {}
+    wells = {}
+
+    tblreader = csv.reader(handle, delimiter=",", quotechar='"')
+    for line in tblreader:
+        if len(line) < 2:
+            continue
+
+        elif _datafile in line[0].strip():
+            # Do we have a previous plate?
+            if plate is not None:
+                qualifiers[_csvData][_datafile] = line[1].strip()
+                plate = PlateRecord(plate.id)
+                for k, v in wells.items():
+                    plate[k] = WellRecord(k, plate, v)
+                plate.qualifiers = qualifiers
+                yield plate
+            plate = PlateRecord(None)
+            data = False
+            qualifiers[_csvData] = {}
+            idx = {}
+            wells = {}
+
+        elif _plate in line[0].strip():
+            plateID = line[1].strip()
+
+            qualifiers[_csvData][_plate] = plateID
+
+            # Parse also non-standard plate IDs
+            if not plateID.startswith(_platesPrefix) and not plateID.startswith(
+                _platesPrefixMammalian
+            ):
+                warnings.warn(
+                    "Non-standard plate ID found (%s)" % plateID, BiopythonParserWarning
+                )
+            else:
+                # Simplify the plate IDs, removing letters, as opm does
+                if plateID.startswith(_platesPrefixMammalian):
+                    pID = plateID[len(_platesPrefixMammalian) :]
+                else:
+                    pID = plateID[len(_platesPrefix) :]
+                while len(pID) > 0:
+                    try:
+                        int(pID)
+                        break
+                    except ValueError:
+                        pID = pID[:-1]
+
+                # No luck
+                if len(pID) == 0:
+                    warnings.warn(
+                        "Non-standard plate ID found (%s)" % plateID,
+                        BiopythonParserWarning,
+                    )
+                elif int(pID) < 0:
+                    warnings.warn(
+                        "Non-standard plate ID found (%s), using %s"
+                        % (plateID, _platesPrefix + "%02d" % abs(int(pID))),
+                        BiopythonParserWarning,
+                    )
+                    plateID = _platesPrefix + "%02d" % abs(int(pID))
+                else:
+                    if plateID.startswith(_platesPrefixMammalian):
+                        plateID = _platesPrefixMammalian + "%02d" % int(pID)
+                    else:
+                        plateID = _platesPrefix + "%02d" % int(pID)
+
+            plate.id = plateID
+
+        elif _strainType in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_strainType] = line[1].strip()
+
+        elif _sample in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_sample] = line[1].strip()
+
+        elif _strainNumber in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_strainNumber] = line[1].strip()
+
+        elif _strainName in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_strainName] = line[1].strip()
+
+        elif _other in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_other] = line[1].strip()
+
+        elif _file in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_file] = line[1].strip()
+
+        elif _position in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_position] = line[1].strip()
+
+        elif _setupTime in line[0].strip():
+            if plate is None:
+                continue
+            qualifiers[_csvData][_setupTime] = line[1].strip()
+
+        elif _hour in line[0].strip():
+            if plate is None:
+                continue
+            data = True
+            for i in range(1, len(line)):
+                x = line[i]
+                if x == "":
+                    continue
+                wells[x.strip()] = {}
+                idx[i] = x.strip()
+
+        elif data:
+            if plate is None:
+                continue
+
+            # Workaround for bad-formatted files
+            try:
+                float(line[0])
+            except ValueError:
+                continue
+
+            time = float(line[0])
+
+            for i in range(1, len(line)):
+                x = line[i]
+
+                try:
+                    signal = float(x)
+                except ValueError:
+                    continue
+
+                well = idx[i]
+                wells[well][time] = signal
+
+    if plate is not None and plate.id is not None:
+        plate = PlateRecord(plate.id)
+        for k, v in wells.items():
+            plate[k] = WellRecord(k, plate, v)
+        plate.qualifiers = qualifiers
+        yield plate
+
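+# Example (hedged sketch): iterate over all plates in a PM CSV export:
+#
+#     with open("Plates.csv") as handle:
+#         for plate in CsvIterator(handle):
+#             print(plate.id, len(plate))
+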
+
+def _toOPM(plate):
+    """Transform a PlateRecord object into a dictionary (PRIVATE)."""
+    d = dict(plate.qualifiers.items())
+
+    d[_csvData] = {}
+    d[_csvData][_plate] = plate.id
+    d[_measurements] = {}
+    d[_measurements][_hour] = []
+    times = set()
+    for wid, w in plate._wells.items():
+        d[_measurements][wid] = []
+        for hour in w._signals:
+            times.add(hour)
+
+    for hour in sorted(times):
+        d[_measurements][_hour].append(hour)
+        for wid, w in plate._wells.items():
+            if hour in w._signals:
+                d[_measurements][wid].append(w[hour])
+            # This shouldn't happen
+            else:
+                d[_measurements][wid].append(float("nan"))
+
+    return d
+
+
+class JsonWriter:
+    """Class to write PM Json format files."""
+
+    def __init__(self, plates):
+        """Initialize the class."""
+        self.plates = plates
+
+    def write(self, handle):
+        """Write this instance's plates to a file handle."""
+        out = []
+        for plate in self.plates:
+            try:
+                out.append(_toOPM(plate))
+            except ValueError:
+                raise ValueError("Could not export plate(s) in JSON format")
+
+        handle.write(json.dumps(out) + "\n")
+
+        return len(out)
+
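+# Example (hedged sketch): serialize one or more plates back to JSON,
+# assuming a PlateRecord "plate" obtained as in the examples above:
+#
+#     with open("plates.json", "w") as handle:
+#         n = JsonWriter([plate]).write(handle)  # returns the number of plates written
+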
+
+if __name__ == "__main__":
+    from Bio._utils import run_doctest
+
+    run_doctest(verbose=0)
diff --git a/code/lib/Bio/phenotype/pm_fitting.py b/code/lib/Bio/phenotype/pm_fitting.py
new file mode 100644
index 0000000..db8ac12
--- /dev/null
+++ b/code/lib/Bio/phenotype/pm_fitting.py
@@ -0,0 +1,146 @@
+# Copyright 2014-2016 by Marco Galardini.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Growth curves fitting and parameters extraction for phenotype data.
+
+This module provides functions to perform sigmoid functions fitting to
+Phenotype Microarray data. This module depends on SciPy's curve_fit function.
+If SciPy is not available, a MissingPythonDependencyError is raised on import.
+
+Functions:
+logistic           Logistic growth model.
+gompertz           Gompertz growth model.
+richards           Richards growth model.
+guess_plateau      Guess the plateau point to improve sigmoid fitting.
+guess_lag          Guess the lag point to improve sigmoid fitting.
+fit                Sigmoid functions fit.
+get_area           Calculate the area under the PM curve.
+"""
+
+import numpy as np
+
+try:
+    from scipy.optimize import curve_fit
+    from scipy.integrate import trapz
+except ImportError:
+    from Bio import MissingPythonDependencyError
+
+    raise MissingPythonDependencyError("Install scipy to extract curve parameters.")
+
+
+def logistic(x, A, u, d, v, y0):
+    """Logistic growth model.
+
+    Proposed in Zwietering et al., 1990 (PMID: 16348228)
+    """
+    y = (A / (1 + np.exp((((4 * u) / A) * (d - x)) + 2))) + y0
+    return y
+
+
+def gompertz(x, A, u, d, v, y0):
+    """Gompertz growth model.
+
+    Proposed in Zwietering et al., 1990 (PMID: 16348228)
+    """
+    y = (A * np.exp(-np.exp((((u * np.e) / A) * (d - x)) + 1))) + y0
+    return y
+
+
+def richards(x, A, u, d, v, y0):
+    """Richards growth model (equivalent to Stannard).
+
+    Proposed in Zwietering et al., 1990 (PMID: 16348228)
+    """
+    y = (
+        A
+        * pow(
+            1
+            + (
+                v
+                + (np.exp(1 + v) * np.exp((u / A) * (1 + v) * (1 + (1 / v)) * (d - x)))
+            ),
+            -(1 / v),
+        )
+    ) + y0
+    return y
+
+
+def guess_lag(x, y):
+    """Given two axes returns a guess of the lag point.
+
+    The lag point is defined as the x point where the difference in y
+    with the next point is higher then the mean differences between
+    the points plus one standard deviation. If such point is not found
+    or x and y have different lengths the function returns zero.
+    """
+    if len(x) != len(y):
+        return 0
+
+    diffs = []
+    indexes = range(len(x))
+
+    for i in indexes:
+        if i + 1 not in indexes:
+            continue
+        diffs.append(y[i + 1] - y[i])
+    diffs = np.array(diffs)
+
+    flex = x[-1]
+    for i in indexes:
+        if i + 1 not in indexes:
+            continue
+        if (y[i + 1] - y[i]) > (diffs.mean() + (diffs.std())):
+            flex = x[i]
+            break
+
+    return flex
+
+
+def guess_plateau(x, y):
+    """Given two axes returns a guess of the plateau point.
+
+    The plateau point is defined as the x point where the y point
+    is near one standard deviation of the differences between the y points to
+    the maximum y value. If such point is not found or x and y have
+    different lengths the function returns zero.
+    """
+    if len(x) != len(y):
+        return 0
+
+    diffs = []
+    indexes = range(len(y))
+
+    for i in indexes:
+        if i + 1 not in indexes:
+            continue
+        diffs.append(y[i + 1] - y[i])
+    diffs = np.array(diffs)
+
+    ymax = y[-1]
+    for i in indexes:
+        if y[i] > (ymax - diffs.std()) and y[i] < (ymax + diffs.std()):
+            ymax = y[i]
+            break
+
+    return ymax
+
+
+def fit(function, x, y):
+    """Fit the provided function to the x and y values.
+
+    Returns the function parameters and the parameter covariance.
+    """
+    # Compute guesses for the parameters
+    # This is necessary to get significant fits
+    p0 = [guess_plateau(x, y), 4.0, guess_lag(x, y), 0.1, min(y)]
+
+    params, pcov = curve_fit(function, x, y, p0=p0)
+    return params, pcov
+
+
+def get_area(y, x):
+    """Get the area under the curve."""
+    return trapz(y=y, x=x)
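+
+
+# A minimal usage sketch (hedged; synthetic data, assumes SciPy is available):
+#
+#     import numpy as np
+#     x = np.arange(0, 48, 0.25)                  # time points (hours)
+#     y = logistic(x, 100.0, 4.0, 5.0, 0.1, 0.0)  # synthetic logistic curve
+#     params, pcov = fit(logistic, x, y)          # recover (A, u, d, v, y0)
+#     area = get_area(y, x)                       # area under the curve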
diff --git a/code/lib/Building_Literature_Embedding_Model.py b/code/lib/Building_Literature_Embedding_Model.py
new file mode 100644
index 0000000..06f27d6
--- /dev/null
+++ b/code/lib/Building_Literature_Embedding_Model.py
@@ -0,0 +1,303 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 29 20:18:05 2022
+
+@author: Jihye Moon
+"""
+import os 
+import numpy as np
+import math
+import pathlib
+from Moon_gene2vec import Gene2vec
+import tensorflow as tf
+from tensorflow.contrib.tensorboard.plugins import projector
+
+class building_embedding_model():
+    def __init__(self):
+        return None
+    
+    def setting(self,preprocessed_path, vocab_dir, logs_dir, gene2doc_dir, baseline_doc_dir):
+        
+        self.baseline_doc_dir=baseline_doc_dir
+        self.logs_dir=logs_dir
+        self.gene2doc_dir=gene2doc_dir
+        self.vocab_dir=vocab_dir
+        
+        self.gene2document =  os.path.join(preprocessed_path, 'gene2document.data.doc.txt')
+        self.baseline_doc =  os.path.join(preprocessed_path, 'baseline_doc.data.doc.txt')
+
+        return None
+    
+    def dumpArrayFile(self,denseList, path, name):
+        np.asarray(denseList).dump(os.path.join(path, name+'.dat'))
+
+    def creating_vocab(self, min_count = 5, min_size = 2): 
+        g2v = Gene2vec()
+        gene2doc, gene1 = g2v.data_loading(self.gene2document)
+        baseline_doc, gene2 = g2v.data_loading(self.baseline_doc)
+        
+        vocabulary = g2v.vocab_output()
+        
+        removed_voc = g2v.selecting_vocab(vocabulary, min_count, min_size=min_size)
+        gene_dict, gene_reverse_dict = g2v.gene_dic(gene1, removed_voc)
+         
+        g2v.vocab_save('excluded_sum'+str(min_count)+'two2doc', gene_dict, self.vocab_dir)
+           
+        self.gene2doc=gene2doc
+        self.gene1=gene1
+        self.baseline_doc=baseline_doc
+        self.gene_dict=gene_dict 
+        self.gene_reverse_dict=gene_reverse_dict
+        self.g2v=g2v
+        self.vocabulary_size = len(self.gene_reverse_dict)
+        
+    def creating_training_data_for_gene2doc(self, window_size):
+        g2v=self.g2v
+        window_size=window_size
+        gene1=self.gene1 
+        gene_dict=self.gene_dict
+        buf_batch=[]; buf_labels=[] 
+        
+        countData=0; indexing=0
+        
+        for i in range(len(self.gene2doc)):
+            save_batch, save_labels = g2v.gene2doc_batch_fucntion(self.gene2doc[i], gene1[i], i, window_size) 
+            save_batch, save_labels = g2v.gene_insert(save_batch,save_labels, gene_dict[gene1[i]]) 
+            save_batch3, save_labels3 = g2v.gene_additing(self.gene2doc[i], gene_dict[gene1[i]], i, window_size)
+            save_batch=np.concatenate((save_batch,save_batch3))
+            save_labels=np.concatenate((save_labels,save_labels3)) 
+                
+            buf_batch.extend(save_batch)
+            buf_labels.extend(save_labels) 
+            if countData==1000: 
+                self.dumpArrayFile(buf_batch, self.gene2doc_dir, 'batch.'+str(indexing))
+                self.dumpArrayFile(buf_labels, self.gene2doc_dir, 'label.'+str(indexing))
+                countData=0
+                buf_batch=[]; buf_labels=[]
+                indexing+=1
+            countData+=1
+        self.dumpArrayFile(buf_batch, self.gene2doc_dir, 'batch.'+str(indexing))
+        self.dumpArrayFile(buf_labels, self.gene2doc_dir, 'label.'+str(indexing)) 
+        del self.gene2doc
+
+    def checking_gene2doc_generation(self, window_size):
+        g2v=self.g2v
+        window_size=window_size
+        gene1=self.gene1 
+        gene_reverse_dict=self.gene_reverse_dict
+        
+        for i in range(1):
+            print("== Examples: ", gene1[i])
+            save_batch, save_labels = g2v.gene2doc_batch_fucntion(self.gene2doc[i], gene1[i], i, window_size) 
+            save_batch, save_labels = g2v.gene_insert(save_batch,save_labels, self.gene_dict[gene1[i]]) 
+            save_batch3, save_labels3 = g2v.gene_additing(self.gene2doc[i], self.gene_dict[gene1[i]], i, window_size)
+            save_batch=np.concatenate((save_batch,save_batch3)) 
+            save_labels=np.concatenate((save_labels,save_labels3))  
+        print("============================== Fig. 3(a) in the published paper ")
+        for k in range(30): 
+            print(gene_reverse_dict[save_batch[k]], '->' , gene_reverse_dict[save_labels[k]])
+        print("============================== Fig. 3(b) in the published paper")
+        for n in range(30): 
+            k=len(save_batch)-1-n
+            print(gene_reverse_dict[save_batch[k]], '->' , gene_reverse_dict[save_labels[k]]) 
+        
+    def creating_training_data_for_word2doc(self, window_size):
+        g2v=self.g2v
+        buf_batch=[]
+        buf_labels=[] 
+        window_size=window_size
+        countData=0; indexing=0
+        for i in range(len(self.baseline_doc)):
+            save_batch, save_labels = g2v.gene2doc_batch_fucntion(self.baseline_doc[i], 0, i, window_size)  
+            buf_batch.extend(save_batch)
+            buf_labels.extend(save_labels) 
+            if countData==50000:
+                print(i, len(self.baseline_doc))
+                self.dumpArrayFile(buf_batch, self.baseline_doc_dir, 'batch.'+str(indexing))
+                self.dumpArrayFile(buf_labels, self.baseline_doc_dir, 'label.'+str(indexing))
+                countData=0
+                buf_batch=[]; buf_labels=[]
+                indexing+=1
+            countData+=1
+        self.dumpArrayFile(buf_batch, self.baseline_doc_dir, 'batch.'+str(indexing))
+        self.dumpArrayFile(buf_labels, self.baseline_doc_dir, 'label.'+str(indexing)) 
+    
+    def model_setting(self, dimension, num_sampled):
+        self.vocabulary_size = len(self.gene_reverse_dict)
+        self.dimension = dimension
+        self.num_sampled = num_sampled
+            
+    def sorting_data_loading(self, data): 
+        batch=[]
+        label=[]
+        full_size=int(len(data)/2)
+        for i in range(full_size):
+            batch.append('batch.'+str(i)+'.dat')
+            label.append('label.'+str(i)+'.dat')
+        return batch, label
+    
+    def logs(self, name, word):
+        f = open(name+'_logs.txt','a') 
+        f.write('{}\n'.format(word))
+        f.close()
+        
+    def starting_sorting(self, model_path):
+        import argparse
+        print('starting making data')
+        logs_dir=self.logs_dir
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            '--log_dir',
+            type=str,
+            default=model_path,
+            help='The log directory for TensorBoard summaries.')
+
+        FLAGS, unparsed = parser.parse_known_args()
+        self.FLAGS = FLAGS
+        if not os.path.exists(FLAGS.log_dir):
+            os.makedirs(FLAGS.log_dir)
+        dir_names = os.listdir(logs_dir)
+        batch_list_dir=[]; target_list_dir=[]
+        for i in range(len(dir_names)):
+            if '.txt' not in dir_names[i]: 
+                print(i, dir_names[i])
+                data_dir = os.path.join(logs_dir, dir_names[i]) 
+                result = os.listdir(data_dir)
+                
+                batch_rd, label_rd = self.sorting_data_loading(result)
+                for j in range(len(batch_rd)):
+                    if 'batch' in batch_rd[j]:
+                        batch_list_dir.append(os.path.join(data_dir, batch_rd[j]))
+                        self.logs(os.path.join(FLAGS.log_dir, 'batch_list'), os.path.join(data_dir, batch_rd[j]))
+                for j in range(len(label_rd)):
+                    if 'label' in label_rd[j]:
+                        target_list_dir.append(os.path.join(data_dir, label_rd[j]))
+                        self.logs(os.path.join(FLAGS.log_dir, 'target_list'), os.path.join(data_dir, label_rd[j]))
+            self.target_list_dir=target_list_dir
+            self.batch_list_dir=batch_list_dir
+        
+    def batch(self, X, y, batch_size, name='batch'): 
+        n_size=len(X)
+        rd_idx = np.random.permutation(n_size) 
+        n_batches = n_size // batch_size
+        for idx in np.array_split(rd_idx, n_batches):
+            X_batch, y_batch = X[idx], y[idx]
+            yield X_batch, y_batch
+             
+    def model_training(self, epoch=10, batch_size=256): 
+        all_size = len(self.batch_list_dir)
+        vocabulary_size=self.vocabulary_size
+        dimension=self.dimension
+        num_sampled=self.num_sampled
+        
+        valid_size = 16   
+        valid_window = 100  
+        valid_examples = np.random.choice(valid_window, valid_size, replace=False)
+        num_steps = epoch
+        graph = tf.Graph()
+        
+        with graph.as_default(): 
+          # Input data.
+          with tf.name_scope('inputs'):
+            train_inputs = tf.placeholder(tf.int32, shape=[None])
+            train_labels = tf.placeholder(tf.int32, shape=[None, 1])
+            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+        
+          with tf.device('/cpu:0'):
+            nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension],
+                                            stddev=1.0 / math.sqrt(dimension)), name='nce_w')
+            nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name='nce_b')
+            embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension], -1.0, 1.0), name='embed1') 
+            embed = tf.nn.embedding_lookup(embeddings, train_inputs, name='lookup')
+        
+          with tf.name_scope('loss'):
+            loss = tf.reduce_mean(
+                tf.nn.nce_loss(
+                    weights=nce_weights, 
+                    biases=nce_biases,
+                    labels=train_labels,
+                    inputs=embed,
+                    num_sampled=num_sampled,
+                    num_classes=vocabulary_size))
+            
+          tf.summary.scalar('loss', loss)
+          with tf.name_scope('optimizer'):
+            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+        
+          norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
+          normalized_embeddings = embeddings / norm
+          valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
+          similarity = tf.matmul(
+              valid_embeddings, normalized_embeddings, transpose_b=True)
+        
+          merged = tf.summary.merge_all()
+          init = tf.global_variables_initializer()
+          saver = tf.train.Saver()
+        
+        savedloss=[]
+        with tf.Session(graph=graph) as session:
+            writer = tf.summary.FileWriter(self.FLAGS.log_dir, session.graph)
+            init.run()  
+                    
+            print('Initialized') 
+            average_loss = 0;
+            counting=0; total_counting=0;
+            with open(self.FLAGS.log_dir + '/metadata.tsv', 'w', encoding='UTF-8') as f:
+                for i in range(vocabulary_size):
+                    f.write(self.gene_reverse_dict[i] + '\n')
+            for step in range(num_steps): 
+                rd_idx = np.arange(all_size)
+                np.random.shuffle(rd_idx)
+                for rn in rd_idx:  
+                    batch_dir = self.batch_list_dir[rn]
+                    label_dir = self.target_list_dir[rn] 
+                    target=np.load(batch_dir, allow_pickle=True)
+                    label=np.load(label_dir, allow_pickle=True)
+                    loading_target=target
+                    loading_label=label
+                    full_size=len(loading_label) 
+                    if step % full_size==0:
+                        full_size=len(loading_label)
+                        rd = np.arange(full_size)
+                        np.random.shuffle(rd) 
+                        # shuffle labels and targets with the same permutation
+                        loading_label=loading_label[rd]
+                        loading_target=loading_target[rd]
+                    for X_batch, y_batch in self.batch(X= target, y=label, batch_size = batch_size): 
+                        y_batch=y_batch.reshape(-1,1)
+                        feed_dict = {train_inputs: X_batch, train_labels: y_batch} 
+                        run_metadata = tf.RunMetadata() 
+                        _, summary, loss_val = session.run(
+                            [optimizer, merged, loss],
+                            feed_dict=feed_dict,
+                            run_metadata=run_metadata)
+                        average_loss += loss_val
+                        counting+=1; total_counting+=1;
+                        writer.add_summary(summary, step)
+                writer.add_run_metadata(run_metadata, 'step%d' % step)
+                average_loss /= counting
+                print('Average loss at step ', step, '/', num_steps, ': ', average_loss) 
+                self.logs(os.path.join(self.FLAGS.log_dir, '128d'), str(step)+' '+str(average_loss))
+                savedloss.append(average_loss)
+                average_loss = 0
+                counting=0 
+                saver.save(session, os.path.join(self.FLAGS.log_dir, 'mid_model.ckpt')) 
+                self.dumpArrayFile(self.gene_reverse_dict, self.FLAGS.log_dir, 'name')  
+                sim = similarity.eval()
+                for i in range(valid_size):
+                    valid_word = self.gene_reverse_dict[valid_examples[i]]
+                    top_k = 8  # number of nearest neighbors
+                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
+                    log_str = 'Nearest to %s:' % valid_word
+                    for k in range(top_k):
+                      close_word = self.gene_reverse_dict[nearest[k]]
+                      log_str = '%s %s,' % (log_str, close_word)
+                    print(log_str)
+            config = projector.ProjectorConfig()
+            embedding_conf = config.embeddings.add()
+            embedding_conf.tensor_name = embeddings.name
+            embedding_conf.metadata_path = os.path.join(self.FLAGS.log_dir, 'metadata.tsv') 
+            
+            saver.save(session, os.path.join(self.FLAGS.log_dir, 'model.ckpt'))
+          
+        writer.close() 
\ No newline at end of file
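+
+# A minimal end-to-end sketch (hedged; the directory and path names below are
+# placeholders, not part of the original code):
+#
+#     bem = building_embedding_model()
+#     bem.setting(preproc_dir, vocab_dir, logs_dir, gene2doc_dir, baseline_dir)
+#     bem.creating_vocab(min_count=5, min_size=2)
+#     bem.creating_training_data_for_gene2doc(window_size=2)
+#     bem.creating_training_data_for_word2doc(window_size=2)
+#     bem.model_setting(dimension=128, num_sampled=64)
+#     bem.starting_sorting(model_path='model/')
+#     bem.model_training(epoch=10, batch_size=256)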
diff --git a/code/lib/CVD_risk_factor_search.py b/code/lib/CVD_risk_factor_search.py
new file mode 100644
index 0000000..8e0fc1f
--- /dev/null
+++ b/code/lib/CVD_risk_factor_search.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Aug  3 13:57:25 2018
+
+@author: Jihye Moon
+"""
+
+class run_intrisic_evaluation():
+    def __init__(self):
+        return None
+    def setting(self, path, gene_symb):
+        import loading_literature_embedding as emb
+    
+        emb2simi=emb.embedding_vector()  
+        words_list, index2word, syn0norm, syn1norm = emb2simi.setting(path, gene_symb)
+        self.emb2simi=emb2simi
+    def running(self, query, output_path, Top_Words):
+        self.emb2simi.similarity_display(query, output_path, Top_Words)
+    
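+# A minimal usage sketch (hedged; the paths and query below are placeholders):
+#
+#     searcher = run_intrisic_evaluation()
+#     searcher.setting('path/to/model', 'path/to/gene_symbols')
+#     searcher.running('hypertension', 'results/', 10)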
diff --git a/code/lib/ExpCohort_Generator.py b/code/lib/ExpCohort_Generator.py
new file mode 100644
index 0000000..603acf8
--- /dev/null
+++ b/code/lib/ExpCohort_Generator.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep  7 14:10:26 2022
+
+@author: moon
+"""
+
+import pandas as pd 
+import pathlib
+import random
+import Literature_Data_Preprocessing as ldpl
+feature_symbol = ['oca','bca','nit','fhha', 'sbld', 'pulrate']
+feature_name = ['Other cancer','Breast cancer','Nitrates','Family history of heart attack','Systolic Blood Pressure (from Waveform Analysis), (mmHg)','Pulse Rate (from Waveform Analysis), (beats/minute)']
+label_symbol=['mi','rca','ang','ptca','cbg']
+label_name = ['Myocardial Infarction (MI)', 'Resuscitated Cardiac Arrest', 'Angina Pectoris', 'Percutaneous Transluminal Coronary Angioplasty (PTCA)', 'Coronary Bypass Graft']  # aligned one-to-one with label_symbol
+
+
+subject_number = 200
+X=[]
+y=[]
+for n in range(subject_number):
+    buffer=[]
+    for i in range(len(feature_name)):
+        buffer.append(random.random())
+    X.append(buffer)
+    y.append([random.randint(0, 1)])
+Xt = pd.DataFrame(X, columns=feature_symbol)
+y = pd.DataFrame(y) 
+
+
+ldp=ldpl.preprocessing('', '', '', '') 
+variables_indexing = {}
+disease_variables_indexing = {}
+
+for i in range(len(feature_name)):  
+    buffer = ldp.sentence_preprocessor(feature_name[i]) 
+    variables_indexing[feature_symbol[i]] = buffer 
+    
+for i in range(len(label_name)):  
+    buffer = ldp.sentence_preprocessor(label_name[i]) 
+    disease_variables_indexing[label_symbol[i]] = buffer 
+    
+example_path='../../data/Example/'
+pathlib.Path(example_path).mkdir(parents=True, exist_ok=True)
+Xt.to_csv(example_path+'Example_X.csv')
+y.to_csv(example_path+'Example_y.csv')
+
+pd.DataFrame(variables_indexing.values()).to_csv(example_path+'variables_preprocessed_names.csv')
+pd.DataFrame(variables_indexing.keys()).to_csv(example_path+'variables_symbol.csv')
+
+pd.DataFrame(disease_variables_indexing.values()).to_csv(example_path+'target_variables_preprocessed_names.csv')
+pd.DataFrame(disease_variables_indexing.keys()).to_csv(example_path+'target_variables_symbol.csv')
diff --git a/code/lib/Literature_Data_Collection.py b/code/lib/Literature_Data_Collection.py
new file mode 100644
index 0000000..ec7993e
--- /dev/null
+++ b/code/lib/Literature_Data_Collection.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 29 17:28:57 2022
+
+@author: Jihye Moon
+"""
+
+import os
+import time
+from lib.Loading_PudMed import ids_pudmed as pudmed
+
+class literature_data_collection():
+    def __init__(self, email, output_dir, document_output_dir, api_key=None):
+        self.output_dir=output_dir
+        self.document_output_dir=document_output_dir
+        self.email = email 
+        self.api_key = api_key
+
+    def text_open(self, path):
+        with open(path, 'r') as f:
+            data=f.read().strip().split('\n')
+        return data
+    
+    def data_split(self, key):
+        return key.split('#')
+    
+    def word_based_query_fit(self, year = None, user_term="heart"):
+        email = self.email
+        pud = pudmed() 
+        search_results, end_point = pud.search_list(user_term, year, email) 
+        return search_results, end_point 
+    
+    def collecting_doc_using_word_based_query(self, year = None, user_term="heart", gap = 50000, starting = 0, ixs = 0, test_end_point=0):
+        email = self.email
+        pud = pudmed() 
+        search_results, end_point = pud.search_list(user_term, year, email) 
+        batch = 10000
+        
+        if test_end_point != 0:
+            end_point = test_end_point # Test 
+            print('Checking data collection performance --- collecting until ',end_point,' documents')
+            
+        counting = round(end_point/gap)
+        ending = starting + gap
+
+        for ix in range(ixs, counting):
+            if ix == counting-1:
+                ending = end_point  # clip the final chunk to the true record count
+            print(ix, '/', counting-1, ' | from ', starting, ' to ', ending)
+            pud.search_full(ix, self.output_dir, search_results, starting, ending, batch)
+            starting = ending
+            ending = starting + gap
+            time.sleep(1)
+                
+    def collecting_doc_using_word_based_query2(self, batch = 10000, gap = 50000, starting = 0, ixs = 0, search_results={}, end_point=0):
+
+        pud = pudmed() 
+        
+        counting = round(end_point/gap)
+        ending = starting + gap
+
+        for ix in range(ixs, counting):
+            if ix == counting-1:
+                ending = end_point  # clip the final chunk to the true record count
+            print(ix, '/', counting-1, ' | from ', starting, ' to ', ending)
+            pud.search_full(ix, self.output_dir, search_results, starting, ending, batch)
+            starting = ending
+            ending = starting + gap
+    
+    def gene_based_query_fit(self, query_len, query_full, query_symbol):
+        self.query_len=query_len
+        self.query_symbol=query_symbol
+        self.query_full=query_full
+    
+    def collecting_doc_using_gene_based_query(self, year = None, batch_size = 10, starting = 0, query_len = 26335, end_point = 2634):
+        document_output_dir=self.document_output_dir
+        counting=starting*batch_size
+        query_len=self.query_len
+        query_symbol=self.query_symbol
+        query_full=self.query_full
+        email = self.email
+        pud = pudmed() 
+        
+        for i in range(starting, end_point+1): 
+            handle2 = open(os.path.join(document_output_dir, "FullText_symbol."+str(i)+".txt"), "w", encoding='utf-8')
+            handle_excluding2 = open(os.path.join(document_output_dir, "excluded_symbol."+str(i)+".txt"), "w", encoding='utf-8')
+            handle_meta2 = open(os.path.join(document_output_dir, "meta_symbol."+str(i)+".txt"), "w", encoding='utf-8') 
+            print('Collecting Gene2doc ',i , '/', end_point)
+            for j in range(batch_size):
+                if counting>=query_len-1:
+                    break
+                time.sleep(5)
+                LR2, FullText2, meta2 = pud.search_gene2doc(query_symbol[counting], email)
+         
+                if LR2!=[]:
+                    indexing2 = str(counting)+'\t'+query_symbol[counting]+'\t'+query_full[counting]+FullText2
+                    handle2.write(indexing2)
+                    handle_meta2.write(str(counting)+'\t'+query_symbol[counting]+'\t'+query_full[counting]+meta2)
+                else: 
+                    handle_excluding2.write(query_symbol[counting]+'\t'+query_full[counting]+'\n')
+                counting += 1 
+            handle_excluding2.close()
+            handle_meta2.close()
+            handle2.close() 
+ 
\ No newline at end of file
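+# --- Usage sketch (illustrative; e-mail and directories are placeholders) ---
+# ldc = literature_data_collection('user@example.com', output_dir, document_output_dir)
+# search_results, total = ldc.word_based_query_fit(year=None, user_term='heart')
+# ldc.collecting_doc_using_word_based_query(year=None, user_term='heart',
+#                                           gap=50000, starting=0, ixs=0)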
diff --git a/code/lib/Literature_Data_Preprocessing.py b/code/lib/Literature_Data_Preprocessing.py
new file mode 100644
index 0000000..0ff733f
--- /dev/null
+++ b/code/lib/Literature_Data_Preprocessing.py
@@ -0,0 +1,347 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 21 00:16:25 2020
+
+@author: Jihye Moon
+
+"""
+
+import os
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from io import StringIO
+try:
+    from sklearn.feature_extraction import stop_words  # scikit-learn < 0.24
+except ImportError:
+    # the stop-word module was made private in newer scikit-learn releases
+    from sklearn.feature_extraction import _stop_words as stop_words
+import re 
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+import Medline
+
+cachedStopWords = stop_words.ENGLISH_STOP_WORDS
+
+class preprocessing():
+    def __init__(self, data_dir, batch_dir, final_dir, preprocessed_dir): 
+        self.final_dir=final_dir
+        self.preprocessed_dir=preprocessed_dir
+        self.batch_dir=batch_dir
+        return None
+     
+    def Indexing(self, name, word):
+        f = open(name+'.txt','w') 
+        for i in range(len(word)):
+            f.write('{}\n'.format(word[i])) 
+        f.close()
+    
+    def batch_data_matching(self, full_path, including_list):
+        try:
+            arr_list=[]
+            dir_names = os.listdir(full_path)
+            j = 0; file_names = []
+            for dir_name in dir_names:
+                if dir_name in including_list:
+    
+                    arr = []
+                    i = 0
+                    full_dir_name = os.path.join(full_path, dir_name)
+                    if (os.path.isdir(full_dir_name)!=True):
+                        continue
+                    text_file_names = os.listdir(full_dir_name)
+                
+                    for text_file_name in text_file_names:
+                        full_text_file_name = os.path.join(full_dir_name, text_file_name)
+                        ext = os.path.splitext(full_text_file_name)[-1]
+                        if ext == '.txt': 
+                            arr.insert(i, full_text_file_name)
+                            i = i+1 
+                    
+                    file_names.append(dir_name)
+                    arr_list.insert(j, arr)
+                    j = j + 1 
+            return file_names, arr_list
+        except PermissionError:
+            pass
+    
+    def file_detection(self, data, name, point):
+        predata=[]
+        for i in range(len(data)):
+            if name in data[i]:
+                predata.append(data[i]) 
+        missing=[];sorting=[]
+        list_sorting=[]
+        for i in range(len(predata)):
+            buffer=predata[i].split('\\')
+            buffer_num = int(buffer[len(buffer)-1].split('.')[point])
+            sorting.append(buffer_num)
+            list_sorting.append([buffer_num, predata[i]])
+        sorting.sort()
+        list_sorting.sort()
+        arranged_list=[]
+        for i in range(len(predata)):
+            arranged_list.append(list_sorting[i][1])
+            if sorting[i]!=i:
+                missing.append(i) 
+                break
+        return sorting, missing, arranged_list
+    
+    def combining_files(self, file_names, data_list, names, point):
+        arr_list={}
+        for i in range(len(file_names)):
+            #print(file_names[i]) 
+            sorting, missing, arranged_list = self.file_detection(data_list[i], names[i], point)
+            counting=0
+            extending=[]
+            if missing==[]:
+                for k in range(len(arranged_list)):
+                    with open(arranged_list[k], 'r') as f:
+                        data = f.read().strip()
+                        data = data.split('\n')
+                        if data!=['']:
+                            extending.extend(data)
+                    if counting==100:
+                        #print(k, '/', len(arranged_list))
+                        #print(data)
+                        counting=0
+                    counting+=1
+                arr_list[file_names[i]]=extending
+        return arr_list
+    
+    def combining_query2doc(self, file_names, data_list, names, point):
+        arr_list={}
+        for i in range(len(file_names)):
+            #print(file_names[i]) 
+            sorting, missing, arranged_list = self.file_detection(data_list[i], names[i], point)
+            counting=0
+            extending=[]
+            if missing==[]:
+                print("NONE MISSING")
+                for k in range(len(arranged_list)):
+                    with open(arranged_list[k], 'r') as f:
+                        data = f.read()
+                        data=data.split('\nPMID')
+                        full_data=[]
+                        for n in range(len(data)):
+                            if len(data[n])>0:
+                                full_data.append('\nPMID'+data[n])
+                        extending.extend(full_data)
+                    if counting==100:
+                        #print(k, '/', len(arranged_list))
+                        #print(data)
+                        counting=0
+                    counting+=1
+                arr_list[file_names[i]]=extending
+        return arr_list
+    
+    def Medine_mapping(self, data):  
+        LR=[]; TI=[]; AB=[]; MH=[]; RN=[]; PMID=[]; DCOM=[]
+        FullText=''; Meta=''
+        rec_file = StringIO(data)
+        medline_rec = Medline.read(rec_file)
+        if 'AB' in medline_rec:
+            if 'LR' in medline_rec:
+                LR.append(medline_rec['LR'])  
+            else:
+                LR.append('.')
+            if 'TI' in medline_rec:
+                TI.append(medline_rec['TI']) 
+            else:
+                TI.append('.')
+            if 'AB' in medline_rec:
+                AB.append(medline_rec['AB']) 
+            else:
+                AB.append('.')
+            if 'MH' in medline_rec:
+                MH.append(medline_rec['MH']) 
+            else:
+                MH.append('.')
+            if 'PMID' in medline_rec:
+                PMID.append(medline_rec['PMID']) 
+            else:
+                PMID.append('.') 
+            if 'DCOM' in medline_rec:
+                DCOM.append(medline_rec['DCOM']) 
+            else:
+                DCOM.append('.') 
+            if 'RN' in medline_rec:
+                RN.append(medline_rec['RN']) 
+            else:
+                RN.append('.') 
+                 
+        for i in range(len(AB)):
+            FullText += '#'+PMID[i]+'\t'+DCOM[i]+'\t'+LR[i]+'\t'+TI[i]+'\t'+AB[i] 
+            Meta += "\t@".join(RN[i])+'\t#'.join(MH[i])
+        FullText+='\n' 
+        Meta+='\n' 
+        return FullText, Meta
+    
+    def gene2doc_mapping(self, data_list):
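+        # Each entry of data_list is expected to look like (as built by
+        # collecting_doc_using_gene_based_query / search_gene2doc):
+        #   <idx>\t<gene_symbol>\t<full_name>\t#<PMID>\t<DCOM>\t<LR>\t<TI>\t<AB>\t#...
+        # so splitting on '\t#' yields a header chunk plus one chunk per article.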
+        gene2doc={}
+        total_size=len(data_list)
+        for i in range(total_size):
+            total_data=''
+            #print(i, '/', len(data_list), round(i/total_size,2)*100)
+            data = data_list[i].split('\t#')
+            gene = data[0].split('\t')[1]
+            data = data[1:len(data)]
+            if len(data)>=1:
+                for j in range(len(data)):        
+                    total_data += data[j].split('\t')[3] + ' ' + data[j].split('\t')[4] 
+            
+            if gene2doc.get(gene,-1) == -1:
+                gene2doc[gene] = total_data
+            else:
+                gene2doc[gene] += total_data + ' '
+        return gene2doc
+
+    def check_valid_word(self, word):
+        # keep a word only if it is not an English stop word
+        return word not in cachedStopWords
+        
+    def stem_word(self, word):
+        ps = PorterStemmer()
+        return ps.stem(word)
+    
+    def replace_all(self, text):
+        patterns= [r'[^\w\s]']
+        for p in patterns:
+            match= re.findall(p, text)
+            for m in match:
+                if m != '-': 
+                    text = text.replace(m, ' ') 
+        return text
+    
+    def replace_num(self, text):
+        patterns= ['[0-9]+']
+        for p in patterns:
+            match= re.findall(p, text)
+            for m in match:
+                if ' '+m+' ' in text: 
+                    text = text.replace(m, ' ')
+        return text
+    
+    def replace_single_num(self, text):
+        text = text.replace('-', '')
+        patterns= ['[0-9]+']
+        for p in patterns:
+            match= re.findall(p, text)
+        if len(match)>0:
+            if len(text) == len(match[0]):
+                single=0
+            else:
+                single=1
+        else:
+            single=1
+            
+        return single
+       
+    def removal_unwanted_pos(self, data):
+        # POS tags to drop: prepositions, determiners, pronouns, modals, comparatives, etc.
+        unwanted = ['IN', 'DT', 'PRP', 'PRP$', 'WRB', 'MD', 'TO', 'RBR', 'RBS', 'CC', 'EX']
+        text=nltk.pos_tag(word_tokenize(data))
+        results = ''
+        for txt, pos in text:
+            if pos not in unwanted:
+                results+=txt+' '
+        return results
+
+    def sentence_preprocessor(self, sentence, stem=False): 
+        sentence = self.removal_unwanted_pos(sentence)
+        sentence = sentence.lower()   
+        sentence = self.replace_all(sentence) 
+        sentence = sentence.replace('.', '. ')
+        sentence = re.sub('[0-9]+', '#', sentence) 
+    
+        new_sentence = "" 
+        words = sentence.split(' ')
+        for word in words: 
+            if stem:
+                word = self.stem_word(word)
+            if self.check_valid_word(word):
+                new_sentence += word + " "
+         
+        
+        new_sentence = new_sentence.replace(' - ', ' ') 
+        new_sentence = new_sentence.replace('- ', '-# ') 
+        new_sentence = new_sentence.replace(' -', ' #-') 
+        
+        new_sentence = new_sentence.replace(' -# ', ' ') 
+        new_sentence = new_sentence.replace(' #- ', ' ') 
+        new_sentence = new_sentence.replace(' -#- ', ' ') 
+        
+        new_sentence = new_sentence.replace(' # ', ' ') 
+        new_sentence = new_sentence.replace(' - ', ' ') 
+         
+        new_sentence = new_sentence + ' ' 
+        new_sentence = new_sentence.strip()
+    
+        return new_sentence
+    
+    def doc_preprocessor(self, sentence, stem=False): 
+        sentence = self.removal_unwanted_pos(sentence)
+        sentence = sentence.lower()   
+        sentence = self.replace_all(sentence) 
+        sentence = sentence.replace('.', '. ')
+        sentence = re.sub('[0-9]+', '#', sentence) 
+    
+        new_sentence = "" 
+        words = sentence.split(' ')
+        for word in words:
+            # optionally stem each word, then keep it only if it is not a stop word
+            if stem:
+                word = self.stem_word(word)
+            if self.check_valid_word(word):
+                new_sentence += word + " "
+         
+        new_sentence = new_sentence.replace(' - ', ' ') 
+        new_sentence = new_sentence.replace('- ', '-# ') 
+        new_sentence = new_sentence.replace(' -', ' #-') 
+        
+        new_sentence = new_sentence.replace(' -# ', ' ') 
+        new_sentence = new_sentence.replace(' #- ', ' ') 
+        new_sentence = new_sentence.replace(' -#- ', ' ') 
+        
+        new_sentence = new_sentence.replace(' # ', ' ') 
+        new_sentence = new_sentence.replace(' - ', ' ') 
+        
+        # remove uninformative words 
+        new_sentence = new_sentence + ' ' 
+        new_sentence = new_sentence.strip()
+    
+        return new_sentence
+    
+    
+    def making_doc_data(self, gene_list, name, dic):
+        preprocessed_dir=self.preprocessed_dir
+        counting=0
+        handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w")
+        if gene_list is None:
+            for i in range(len(dic)): 
+                if counting==10000:
+                    print(i, '/', len(dic))
+                    counting=0
+                buffer = dic[i].split('\t')
+                if buffer[0] != '\n':
+                    buffer = buffer[3] + buffer[4]
+                    if buffer != '':
+                        buffer = self.doc_preprocessor(buffer) 
+                        handle.write('-1' + '\t' + buffer + '\n')
+                counting+=1
+                
+        else:
+            for i in range(len(gene_list)): 
+                if counting==1000:
+                    print(i, '/', len(gene_list))
+                    counting=0
+                data = dic[gene_list[i]] 
+                buffer = self.doc_preprocessor(data)
+                if buffer != '':
+                    handle.write('#'+ gene_list[i] + '\t' + buffer + '\n')
+                counting+=1
+        handle.close()
+
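+# --- Usage sketch (illustrative; the sample sentence is an assumption) ----
+# ldp = preprocessing('', '', '', '')
+# ldp.sentence_preprocessor('Systolic Blood Pressure (from Waveform Analysis), (mmHg)')
+# # -> a lowercased, punctuation- and number-stripped string of content words;
+# #    the exact output depends on the installed NLTK tagger and stop-word list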
diff --git a/code/lib/Loading_PudMed.py b/code/lib/Loading_PudMed.py
new file mode 100644
index 0000000..43916a5
--- /dev/null
+++ b/code/lib/Loading_PudMed.py
@@ -0,0 +1,229 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 10 00:25:50 2020
+
+@author: Jihye Moon
+
+"""
+import sys
+from lib.Bio import Entrez
+import lib.Bio
+from datetime import datetime
+from io import StringIO
+import time
+sys.path.append('lib')
+
+import lib.Medline
+import os
+
+date = startTime = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+
+class ids_pudmed():
+    def __init__(self, snp_ids=[]):
+        self.snp_ids=snp_ids
+        self.uids=[]
+        self.gene_names=[]
+        self.names=[]
+        self.records=[]
+        self.gene_full_names=[]
+        self.saved_snp_id=[]
+        
+    def search_ids(self, search_email):
+        removal_index=[]
+        Entrez.email = search_email
+        records=[]
+        for snp_id in self.snp_ids:
+            record = Entrez.read(Entrez.elink(dbfrom="snp", 
+                                  id=snp_id.replace('rs',''), 
+                                  db="gene")) 
+            if record[0]['LinkSetDb']==[]:
+                removal_index.append(snp_id)
+                print("index is removed: ", snp_id)
+                
+            else:
+                results = record[0]['LinkSetDb'][0]['Link']
+                multi_gene=[]
+                multi_full_name=[]
+                multi_uid=[]
+                #records=[]
+                for result in results:
+                    uid = result['Id']
+                    handle = Entrez.esummary(db="gene", id=uid)
+                    uid_record = Entrez.read(handle)
+                    
+                    records.append(uid_record)
+                    handle.close()
+                    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+                    gene_name = uid_summary['Name']
+                    gene_full_name = uid_summary['Description']
+                    if len(results)>1:
+                        multi_gene.append(gene_name)
+                        multi_full_name.append(gene_full_name)
+                        multi_uid.append(uid)
+                        
+                        #records.append(uid_record)
+                    else:
+                        multi_gene = gene_name
+                        multi_full_name = gene_full_name
+                        multi_uid = uid
+                        #records = uid_record
+            
+                #print(results)
+            
+                if len(results)>1:
+                    multi_uid= "#".join(multi_uid)
+                    multi_gene= "#".join(multi_gene) 
+                    multi_full_name= "#".join(multi_full_name) 
+                    #records= " ".join(records) 
+                
+                #print(count, "/",len(self.snp_ids)," : ", snp_id, multi_uid, multi_gene)
+                self.uids.append(multi_uid)
+                self.gene_names.append(multi_gene)
+                self.gene_full_names.append(multi_full_name)
+                self.saved_snp_id.append(snp_id)
+                #self.records.append(records) 
+        return removal_index, self.records, self.uids, self.gene_names, self.gene_full_names
+        #return records
+    def search_id2summary(self, uids, search_email): 
+        Entrez.email = search_email
+        records=''
+        for uid in uids: 
+            summary='#'
+            handle = Entrez.esummary(db="gene", id=uid)
+            #uid_record = Entrez.read(handle) 
+            uid_record = Entrez.read(handle,validate=False)
+            #records.append(uid_record)
+            handle.close()
+            #print( uid_record["DocumentSummarySet"]['DocumentSummary'])
+            if uid_record["DocumentSummarySet"]['DocumentSummary']==[]:    
+                handle = Entrez.esummary(db="gene", id=uid)
+                uid_record = Entrez.read(handle) 
+                handle.close()
+                uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+            else:
+                uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+            gene_name = uid_summary['Name']
+            gene_full_name = uid_summary['Description']
+            if 'Summary' in uid_summary:
+                summary = uid_summary['Summary']
+                if summary == '':
+                    summary = '.'
+            sentence = uid + '\t' + gene_name + '\t' + gene_full_name + '\t' + summary
+            records += sentence + '\n'
+        return records
+
+    def search_gene2doc(self, query, email):
+        LR=[]; TI=[]; AB=[]; MH=[]; RN=[]; PMID=[]; DCOM=[]
+        rec_handler = self.search_medline(query, email)
+
+        FullText=''; Meta=''
+        for rec_id in rec_handler['IdList']:
+            rec = self.fetch_rec(rec_id, rec_handler)
+            rec_file = StringIO(rec)
+            medline_rec = Medline.read(rec_file)  
+            if medline_rec != []:
+                if 'LR' in medline_rec:
+                    LR.append(medline_rec['LR'])  
+                else:
+                    LR.append('.')
+                if 'TI' in medline_rec:
+                    TI.append(medline_rec['TI']) 
+                else:
+                    TI.append('.')
+                if 'AB' in medline_rec:
+                    AB.append(medline_rec['AB']) 
+                else:
+                    AB.append('.')
+                if 'MH' in medline_rec:
+                    MH.append(medline_rec['MH']) 
+                else:
+                    MH.append('.')
+                if 'PMID' in medline_rec:
+                    PMID.append(medline_rec['PMID']) 
+                else:
+                    PMID.append('.') 
+                if 'DCOM' in medline_rec:
+                    DCOM.append(medline_rec['DCOM']) 
+                else:
+                    DCOM.append('.') 
+                if 'RN' in medline_rec:
+                    RN.append(medline_rec['RN']) 
+                else:
+                    RN.append('.') 
+        for i in range(len(AB)):
+            FullText += '\t#'+PMID[i]+'\t'+DCOM[i]+'\t'+LR[i]+'\t'+TI[i]+'\t'+AB[i] 
+            Meta += "\t@".join(RN[i])+'\t#'.join(MH[i])
+        FullText+='\n' 
+        Meta+='\n' 
+        return AB, FullText, Meta
+     
+    def search_medline(self, query, email):
+        Entrez.email = email
+        search = Entrez.esearch(db='pubmed', term=query, usehistory='y')
+        
+        handle = Entrez.read(search)
+        try:
+            return handle
+        except Exception as e:
+            raise IOError(str(e))
+        finally:
+            search.close() 
+
+    def search_list(self, query, year, email): 
+        self.user_term = query
+        self.email = email
+        self.year=year
+        self.user_db="pubmed"
+        
+        Entrez.email = email
+        if year==None:
+            search_results = Entrez.read(
+                Entrez.esearch(
+                    db=self.user_db, term=self.user_term, datetype="pdat", usehistory="y"
+                    )
+                )
+            self.name = 'full'
+        else:
+            user_reldate = 365*year
+            search_results = Entrez.read(
+                Entrez.esearch(
+                    db=self.user_db, term=self.user_term, reldate=user_reldate, datetype="pdat", usehistory="y"
+                    #db=self.user_db, term=user_term, datetype="pdat", usehistory="y"
+                    )
+                )
+            self.name = str(year)
+
+        count = int(search_results["Count"]) 
+        return search_results, count
+    
+    def search_full(self, ix, data_dir, search_results, starting, count, batch): 
+        batch_size = batch
+        out_handle = open(os.path.join(data_dir, self.user_db+'.'+self.user_term+"."+str(ix)+"."+self.name+".txt"), "w", encoding='utf-8') 
+        for start in range(starting, count, batch_size):
+            end = min(count, start + batch_size) 
+            print("Going to download records from %i to %i" % (start + 1, end))
+            fetch_handle = Entrez.efetch(
+                db="pubmed",
+                rettype="medline",
+                retmode="text",
+                retstart=start,
+                retmax=end - start,  # request only the records remaining in this batch
+                webenv=search_results["WebEnv"],
+                query_key=search_results["QueryKey"],
+                )
+            data = fetch_handle.read()
+            fetch_handle.close()
+            out_handle.write(data)
+            time.sleep(2)  # Delay between each batch fetch to respect the API rate limit
+        out_handle.close()
+        
+    def fetch_rec(self, rec_id, entrez_handle):
+        fetch_handle = Entrez.efetch(db='pubmed', id=rec_id,
+                                 rettype='Medline', retmode='text',
+                                 webenv=entrez_handle['WebEnv'],
+                                 query_key=entrez_handle['QueryKey'])
+        rec = fetch_handle.read()
+        fetch_handle.close()
+        return rec
+
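+# --- Usage sketch (illustrative; e-mail and paths are placeholders) -------
+# pud = ids_pudmed()
+# results, count = pud.search_list('heart', None, 'user@example.com')
+# pud.search_full(0, '/tmp/docs', results, 0, count, 10000)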
diff --git a/code/lib/ML_models.py b/code/lib/ML_models.py
new file mode 100644
index 0000000..9bb602d
--- /dev/null
+++ b/code/lib/ML_models.py
@@ -0,0 +1,540 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 30 22:02:06 2022
+
+@author: Jihye Moon
+"""
+import numpy as np
+import pathlib
+import pandas as pd
+import os
+
+# The models below use the TF1 graph API (tf.placeholder, tf.Session, tf.layers);
+# on TensorFlow 2.x the usual workaround is `import tensorflow.compat.v1 as tf`
+# followed by `tf.disable_v2_behavior()`.
+import tensorflow as tf
+
+from sklearn.linear_model import LogisticRegression 
+from sklearn.svm import LinearSVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier 
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import StandardScaler    
+
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import f_regression
+from sklearn.ensemble import ExtraTreesClassifier
+from skfeature.function.similarity_based import fisher_score
+
+from imblearn.over_sampling import ADASYN, SMOTE
+
+class dimension_reducers():
+    def __init__(self):
+        return None
+    
+    def PCA(self, X_train, X_test, X_valid, dim):
+        from sklearn.decomposition import PCA 
+        scaler = StandardScaler()
+        pca = PCA(n_components=dim)
+        # fit on the training split only, then project all splits with the same components
+        C1 = pca.fit_transform(X_train)
+        C2 = pca.transform(X_test)
+        C3 = pca.transform(X_valid)
+        
+        scaler.fit(C1)
+        C1 = scaler.transform(C1)
+        C2 = scaler.transform(C2)
+        C3 = scaler.transform(C3) 
+        return C1, C2, C3
+    
+    def UMAP(self, X_train, X_test, X_valid, dim): 
+        import umap 
+        scaler = StandardScaler()
+        reducer = umap.UMAP(n_components=dim)
+        # fit on the training split only so all splits share one embedding space
+        reducer.fit(X_train)
+        B1 = reducer.transform(X_train)
+        B2 = reducer.transform(X_test)
+        B3 = reducer.transform(X_valid)
+        
+        scaler.fit(B1)
+        B1 = scaler.transform(B1)
+        B2 = scaler.transform(B2)
+        B3 = scaler.transform(B3) 
+        return B1, B2, B3
+    
+    def Our_DR(self, reduced_emb0, X_train, X_test, X_valid, dim):
+        scaler =StandardScaler()
+        A2=np.matmul(X_test, reduced_emb0) 
+        A1=np.matmul(X_train, reduced_emb0)  
+        A3=np.matmul(X_valid, reduced_emb0) 
+        
+        scaler.fit(A1)
+        A1 = scaler.transform(A1)
+        A2 = scaler.transform(A2)
+        A3 = scaler.transform(A3) 
+        return A1, A2, A3
+    
+class feature_selectors():
+    def __init__(self):
+        return None
+    
+    def dataTump(self, result_dir, word, name):
+        with open(result_dir+'/'+name+'logs.txt', 'a') as f:
+            f.write('{}\t'.format(word))
+            f.write('\n')
+        
+    def H2FS_fit(self, X_train, y_train, feature_size):
+        fnn = round(feature_size*0.5)
+        wg=self.HFS(X_train,y_train, feature_size)
+        hf_score=list(wg.values())
+        hf_idx = np.argsort(hf_score).tolist()
+        hf_idx = hf_idx[::-1][0:fnn]
+        self.hf_idx=hf_idx
+        
+    def H2FS_transform(self, X):
+        new_X = X[:,self.hf_idx]
+        #X_test3 = X_test[:,hf_idx]
+        #X_valid3 = X_valid[:,hf_idx]
+        return new_X
+    
+    def HFS(self, X_train, y_train, feature_size):
+        all_feature=X_train.shape[1]
+        weights = {}
+        for i in range(all_feature):
+            weights[i]=0
+        fis_idx, f1_idx, et_idx = self.HFS_FS(X_train, y_train)
+        
+        cases = [fis_idx, f1_idx, et_idx]
+        fns=[round(feature_size*0.3), round(feature_size*0.4),round(feature_size*0.5)] # 30%, 40%, and 50% of all features. Refer original H2FS paper 
+        count=0
+        for case in cases: 
+            for fn in fns: 
+                selected_features=case[0:fn] 
+                acc1, acc2, acc3, acc4, acc5 = self.HFS_CS(X_train[:,selected_features], y_train)
+                acc=[acc1, acc2, acc3, acc4, acc5] 
+                for sf in selected_features:
+                    weights[sf]=sum(acc)
+            count+=1
+        return weights
+    
+    def HFS_CS(self, X_train, y_train):
+        from sklearn.naive_bayes import GaussianNB
+        from sklearn.neighbors import KNeighborsClassifier
+        from sklearn.naive_bayes import BernoulliNB
+        clf = GaussianNB()
+        clf.fit(X_train, y_train)
+        
+        acc1= clf.score(X_train, y_train)
+        neigh = KNeighborsClassifier()
+        neigh.fit(X_train, y_train)
+        
+        acc2= neigh.score(X_train, y_train)
+        clf = BernoulliNB() 
+        clf.fit(X_train, y_train)
+        
+        acc3= clf.score(X_train, y_train)
+        clf = DecisionTreeClassifier()
+        clf.fit(X_train, y_train)
+        
+        acc4= clf.score(X_train, y_train)
+        clf = RandomForestClassifier()
+        clf.fit(X_train, y_train)
+        acc5= clf.score(X_train, y_train)
+        return acc1, acc2, acc3, acc4, acc5
+    
+    def HFS_FS(self, X_train, y_train, fn=282): 
+        fis_idx = fisher_score.fisher_score(X_train, y_train, mode='rank') #returns rank directly instead of fisher score. so no need for feature_ranking
+        fis_idx=fis_idx[0:fn]
+        
+        f1_clf=SelectKBest(f_regression, k=fn).fit(X_train,y_train)
+        f1_score=f1_clf.scores_ 
+        f1_idx = np.argsort(f1_score).tolist()
+        f1_idx = f1_idx[::-1][0:fn]
+        
+        rnd_clf = ExtraTreesClassifier()
+        rnd_clf.fit(X_train, y_train) 
+        et_score=rnd_clf.feature_importances_ 
+        et_idx = np.argsort(et_score).tolist()
+        et_idx = et_idx[::-1][0:fn]
+        return fis_idx, f1_idx, et_idx
+      
+    def Our_FS(self, emb2simi, name, embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, embedding, target_embedding_list, index2target, index2variables, target_embedding, feature_size, result_dir): 
+        gene_name = '../gene_name_info/query_full_name'; gene_symb='../gene_name_info/query_symbol' 
+        _, embed_name = emb2simi.target2variable(" ".join(list(disease_variables_indexing.keys())), target_embedding, target_embedding_list, embedding, embedding_list, index2variables, variables_indexing, feature_size)
+        df = pd.DataFrame(embed_name)
+        df.to_csv(os.path.join(result_dir, name+'.csv'), index=False)  
+        print('Selected features by our FS were saved in', result_dir)
+
+        return embed_name
+    def RF(self, ix, X_train, y_train, X_test, y_test, names, result_dir):
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import GridSearchCV 
+        rnd_grid = [
+            {'n_estimators': [128, 256, 384], 'max_features': [128]}, 
+            ]
+        rnd_clf = RandomForestClassifier()
+        grid_search3 = GridSearchCV(rnd_clf, rnd_grid, cv=None, scoring='accuracy', return_train_score=True)
+        grid_search3.fit(X_train, y_train)
+        best_param=grid_search3.best_params_
+      
+        rnd_clf = RandomForestClassifier(**best_param)
+        rnd_clf.fit(X_train, y_train)
+        values=rnd_clf.feature_importances_ 
+        indices = np.argsort(values).tolist()
+        indices = indices[::-1]
+        for i in range(len(indices)):
+            rlt = str(names[indices[i]])+' '+str(values[indices[i]])+' '+str(indices[i])
+            self.dataTump(result_dir, rlt,ix+' RF') 
+        print('Selected features by RF were saved in', result_dir)
+        return names, values, indices
+
+    def DT(self, ix, X_train, y_train, X_test, y_test, names, result_dir):
+        from sklearn.tree import DecisionTreeClassifier
+        from sklearn.model_selection import GridSearchCV 
+        rnd_grid = [
+            {'max_features': [128], 'max_depth':[3, 5], 'max_leaf_nodes':[3, 5]}, 
+            ] 
+        rnd_clf = DecisionTreeClassifier()
+        grid_search3 = GridSearchCV(rnd_clf, rnd_grid, cv=None, scoring='accuracy', return_train_score=True)
+        grid_search3.fit(X_train, y_train)
+        best_param=grid_search3.best_params_
+    
+        rnd_clf = DecisionTreeClassifier(**best_param)
+        rnd_clf.fit(X_train, y_train)
+        values=rnd_clf.feature_importances_ 
+        indices = np.argsort(values).tolist()
+        indices = indices[::-1]
+        for i in range(len(indices)):
+            rlt = str(names[indices[i]])+' '+str(values[indices[i]])+' '+str(indices[i]) 
+            self.dataTump(result_dir, rlt,ix+' DT')
+        print('Selected features by DT were saved in', result_dir)
+        return names, values, indices
+
+class predictors():
+    def __init__(self):
+        return None
+    
+    def reset_graph(self, seed=42):
+      tf.reset_default_graph()
+      tf.set_random_seed(seed)
+      np.random.seed(seed)
+      
+    def batch(self, X, y, batch_size, name='batch'):  
+        n_size=len(X)
+        rd_idx = np.random.permutation(n_size)  
+        n_batches = n_size // batch_size
+        for idx in np.array_split(rd_idx, n_batches):
+            X_batch, y_batch = X[idx], y[idx]
+            yield X_batch, y_batch
+            
+    def softmax(self, sx, name='softmax'):  
+        sfxmax=[]
+        for i in range(len(sx)):
+            sfxmax.append((np.exp(sx[i])/np.sum(np.exp(sx),axis=1)))
+        return sfxmax 
+
+    def CNN_train(self, X_train, _y_train, X_test, _y_test, X_valid, _y_valid, n_inputs_label=2):   
+        X_train = X_train.reshape(-1, X_train.shape[1], 1) 
+        X_test = X_test.reshape(-1, X_test.shape[1], 1)
+        X_valid = X_valid.reshape(-1, X_valid.shape[1], 1)
+            
+        _y_train = np.squeeze(_y_train)
+        _y_test = np.squeeze(_y_test)  
+        _y_valid = np.squeeze(_y_valid) 
+             
+        n_outputs = 2
+    
+        print("Class: ",n_outputs)
+    
+        learning_rate = 0.001 
+    
+        self.reset_graph()
+    
+        channels = 1
+        n_inputs = n_inputs_label
+        print(n_inputs)
+    
+        conv1_fmaps = 16  
+        conv1_ksize = [3]
+        conv1_stride = [2] 
+    
+        conv_pad = "SAME"  
+        n_fc1 = 64  
+        n_outputs = 2  
+             
+        folder_path="../../results/CNN_model"
+        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)  
+    
+        graph = tf.Graph()
+    
+        with graph.as_default():
+            
+            with tf.name_scope("inputs"):
+                input_X = tf.placeholder(tf.float32, shape=[None, n_inputs, channels], name="X") 
+                input_y = tf.placeholder(tf.int32, shape=[None], name="y")
+                keep_prob = tf.placeholder(tf.float32) 
+    
+            with tf.name_scope("conv"):  
+                conv1 = tf.layers.conv1d(input_X, filters=conv1_fmaps, kernel_size=conv1_ksize,
+                             strides=conv1_stride, padding=conv_pad,
+                             activation=tf.nn.elu, name="conv1") 
+                pool1 = tf.layers.max_pooling1d(conv1, pool_size=2, strides=1, padding='SAME')
+                drop_out1 = tf.nn.dropout(pool1, keep_prob)
+                conv2 = tf.layers.conv1d(drop_out1, filters=conv1_fmaps, kernel_size=conv1_ksize,
+                             strides=conv1_stride, padding=conv_pad,
+                             activation=tf.nn.elu, name="conv2")
+                pool2 = tf.layers.max_pooling1d(conv2, pool_size=2, strides=1, padding='SAME')
+     
+            with tf.name_scope("conv2"): 
+                [a,b,c] = pool2.shape
+                pool8_flat = tf.reshape(pool2, shape=[-1, int(b) * int(c)])
+                drop_out9 = tf.nn.dropout(pool8_flat, keep_prob)
+    
+            with tf.name_scope("fc1"):
+                fc1 = tf.layers.dense(drop_out9, n_fc1, activation=tf.nn.relu, name="fc1") 
+    
+            with tf.name_scope("output"):
+                logits = tf.layers.dense(fc1, n_outputs, name="output") 
+                outputs=logits
+                
+            with tf.name_scope("train"): 
+                xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=input_y)
+                loss = tf.reduce_mean(xentropy) 
+                optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 
+                training_op = optimizer.minimize(loss)
+                tf.summary.scalar('loss', loss)
+                merged = tf.summary.merge_all()
+        
+            with tf.name_scope("eval"):
+                correct = tf.nn.in_top_k(outputs, input_y, 1) 
+                accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+    
+            with tf.name_scope("init_and_save"):
+                init = tf.global_variables_initializer()
+                saver = tf.train.Saver()
+    
+        n_epochs = 100
+        batch_size = 128
+        
+        saved_acc = []
+        valid = 0; test = 0; valid_Z = 0
+        # predictions from the best validation epoch; filled during training
+        valid_prediction = None; test_prediction = None; prob = None
+        
+        with tf.Session(graph=graph) as sess:
+            writer = tf.summary.FileWriter(folder_path+'/TB', sess.graph)
+            init.run()
+            for epoch in range(n_epochs):        
+                run_metadata = tf.RunMetadata()  
+                for X_batch, y_batch in self.shuffle_batch(X_train, _y_train, batch_size):  
+                    _, summary, loss_val = sess.run([training_op, merged, loss], feed_dict={input_X: X_batch, input_y: y_batch, keep_prob:0.7},run_metadata=run_metadata)
+                    writer.add_summary(summary, epoch)  
+                acc_batch = accuracy.eval(feed_dict={input_X: X_batch, input_y: y_batch, keep_prob:1.0})
+                X_test=X_test.reshape(-1, n_inputs_label, 1)
+                X_valid=X_valid.reshape(-1, n_inputs_label, 1)
+                acc_test = accuracy.eval(feed_dict={input_X: X_test, input_y: _y_test, keep_prob:1.0}) 
+                acc_valid = accuracy.eval(feed_dict={input_X: X_valid, input_y: _y_valid, keep_prob:1.0})  
+                
+                if acc_valid>valid:
+                    valid=acc_valid
+                    test=acc_test 
+                    Z=logits.eval(feed_dict={input_X:X_test, keep_prob:1.0})
+                    prob=Z
+                    y_pred=np.argmax(Z, axis=1) 
+                    test_prediction=y_pred
+                    valid_Z=logits.eval(feed_dict={input_X:X_valid, keep_prob:1.0})
+                    valid_y_pred=np.argmax(valid_Z, axis=1) 
+                    valid_prediction=valid_y_pred
+                saved_acc.append([acc_batch, acc_test])
+            
+            acc_test = accuracy.eval(feed_dict={input_X: X_test, input_y: _y_test, keep_prob:1.0})
+            print("best valid :", valid, " test sets:", test)
+    
+            save_path = saver.save(sess, folder_path+"/CNN_model.ckpt")
+            print("this model is saved to ",save_path) 
+        writer.close()
+        return valid_prediction, test_prediction, prob[:,1]
+    
+    def DNN_train2(self, X_train, y_train, X_test, y_test, X_valid, y_valid, n_inputs_label): 
+        self.reset_graph()
+        n_inputs = n_inputs_label
+        n_layers = 10
+        n_hidden1 = 100
+        n_outputs = 2 
+        
+        learning_rate = 0.001 
+        
+        n_epochs = 1000 
+        batch_size = 128
+        
+        saved_acc=[] 
+        
+        folder_path="DNN_model"
+        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
+    
+        X = tf.placeholder(tf.float32, [None, n_inputs])
+        y = tf.placeholder(tf.int64, [None])
+        keep_prob = tf.placeholder(tf.float32)
+        training = tf.placeholder_with_default(False, shape=(), name='training')
+    
+        with tf.variable_scope("dnn"): 
+            for i in range(n_layers):
+                layer_name="hidden"+str(i)
+                if i==0:
+                    hidden = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=layer_name)
+                else:
+                    hidden = tf.nn.dropout(hidden, keep_prob)
+                    hidden = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=layer_name)
+                
+            logits = tf.layers.dense(hidden, n_outputs, name="outputs")
+        
+        with tf.variable_scope("loss"): 
+            crossentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
+            loss = tf.reduce_mean(crossentropy, name="loss")
+    
+        with tf.name_scope("train"): 
+            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+            training_op = optimizer.minimize(loss)
+    
+        with tf.name_scope("eval"): 
+            correct = tf.nn.in_top_k(logits, y, 1)
+            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+    
+        init = tf.global_variables_initializer() 
+        saver = tf.train.Saver() 
+    
+        epoch_count=0; valid=0; test=0; prob=0
+        with tf.Session() as sess:
+            init.run()
+            for epoch in range(n_epochs):
+                epoch_count=epoch_count+1
+                for X_batch, y_batch in self.batch(X_train, y_train, batch_size):
+                    sess.run(training_op, feed_dict={X: X_batch, y:y_batch, keep_prob:1.0, training:True}) 
+                acc_batch = accuracy.eval(feed_dict={X:X_batch, y:y_batch, keep_prob:1.0}) 
+                acc_test = accuracy.eval(feed_dict={X: X_test, y:y_test, keep_prob:1.0}) 
+                acc_valid = accuracy.eval(feed_dict={X: X_valid, y:y_valid, keep_prob:1.0}) 
+
+                if acc_valid>=valid:
+                    valid=acc_valid
+                    test=acc_test 
+                    Z=logits.eval(feed_dict={X:X_test, keep_prob:1.0})
+                    y_pred=np.argmax(Z, axis=1) 
+                    prob=Z
+                    test_prediction=y_pred
+                    valid_Z=logits.eval(feed_dict={X:X_valid, keep_prob:1.0})
+                    valid_y_pred=np.argmax(valid_Z, axis=1) 
+                    valid_prediction=valid_y_pred
+                saved_acc.append([acc_batch, acc_test])
+            save_path = saver.save(sess, folder_path+"/test.ckpt") 
+            print("DNN model is saved to :", save_path)
+            print("The best valid ", valid," test", test)
+            
+        return valid_prediction, test_prediction, prob[:,1]
+    
+    def multi_models_running(self, _X, _y, X_test, y_test):
+      total_prediction=[]; total_proba=[]
+      print("=================== LinearSVC")
+      grid = [
+            {'C': [0.01, 0.1, 1.0]}, 
+            ] 
+      clf = LinearSVC() 
+      grid_search = GridSearchCV(clf, grid, cv=None, scoring='accuracy', return_train_score=True)
+      grid_search.fit(_X, _y)
+      best_param=grid_search.best_params_
+      clf = LinearSVC(**best_param) 
+      clf.fit(_X, _y)
+      y_preds1=clf.predict(X_test) 
+      total_prediction.append(y_preds1) 
+      y_score2 = clf.decision_function(X_test)
+      total_proba.append(y_score2)
+    
+      print("=================== DT") 
+      grid = [
+        {'max_features': [128], 'max_depth':[3, 5], 'max_leaf_nodes':[3, 5]}, 
+      ]
+      clf = DecisionTreeClassifier() 
+      grid_search = GridSearchCV(clf, grid, cv=None, scoring='accuracy', return_train_score=True)
+      grid_search.fit(_X, _y)
+      best_param=grid_search.best_params_
+      clf = DecisionTreeClassifier(**best_param)   
+      clf.fit(_X, _y)
+      y_preds3=clf.predict(X_test)
+      total_prediction.append(y_preds3)
+     
+      if len(list(set(y_test)))>2:
+          y_score4 = clf.predict_proba(X_test)
+      else:
+          y_score4 = clf.predict_proba(X_test)[:,1] 
+    
+      total_proba.append(y_score4)
+      
+      print("=================== RF")
+      grid = [
+        {'n_estimators': [128, 256, 384], 'max_features': [128]}, 
+      ]
+      clf = RandomForestClassifier() 
+      grid_search = GridSearchCV(clf, grid, cv=None, scoring='accuracy', return_train_score=True)
+      grid_search.fit(_X, _y)
+      best_param=grid_search.best_params_
+      clf = RandomForestClassifier(**best_param)
+      clf.fit(_X, _y)
+      y_preds4=clf.predict(X_test)
+      total_prediction.append(y_preds4)
+    
+      if len(list(set(y_test)))>2:
+          y_score4 = clf.predict_proba(X_test)
+      else:
+          y_score4 = clf.predict_proba(X_test)[:,1] 
+    
+      total_proba.append(y_score4)
+     
+      print("=================== LR")
+      grid = [
+            {'C': [0.01, 0.1, 1.0]}, 
+            ] 
+      clf = LogisticRegression() 
+      grid_search = GridSearchCV(clf, grid, cv=None, scoring='accuracy', return_train_score=True)
+      grid_search.fit(_X, _y)
+      best_param=grid_search.best_params_
+      clf = LogisticRegression(**best_param)
+      clf.fit(_X, _y)
+      y_preds5=clf.predict(X_test) 
+       
+      if len(list(set(y_test)))>2:
+          y_score5 = clf.predict_proba(X_test)
+      else:
+          y_score5 = clf.predict_proba(X_test)[:,1]
+    
+      total_prediction.append(y_preds5)
+      total_proba.append(y_score5)
+      
+      return total_prediction, total_proba
+    
+    def shuffle_batch(self, X, y, batch_size):
+      rnd_idx = np.random.permutation(len(X))
+      n_batches = len(X) // batch_size  
+      for batch_idx in np.array_split(rnd_idx, n_batches):
+        X_batch, y_batch = X[batch_idx], y[batch_idx] 
+        yield X_batch, y_batch
+         
+    def dumpArrayFile(self, denseList, fileName):
+        np.asarray(denseList).dump(fileName + '.dat')
+    
+    def run_save(self, X_train, y_train, X_test, y_test, X_valid, y_valid, name, sampling, dimension, result_dir):
+        if sampling=='SMOTE':
+            oversample = SMOTE()
+        else:
+            oversample = ADASYN()
+            
+        Re_X_train, Re_y_train = oversample.fit_resample(X_train, y_train)
+        _, prediction, prob = self.DNN_train2(Re_X_train, Re_y_train, X_test, y_test, X_valid, y_valid, dimension)
+        total_prediction, total_prob = self.multi_models_running(Re_X_train, Re_y_train, X_test, y_test) 
+        total_prediction.extend([prediction])    
+        total_prob.extend([prob])
+        _, prediction, prob = self.CNN_train(Re_X_train, Re_y_train, X_test, y_test, X_valid, y_valid, dimension)
+        total_prediction.extend([prediction])
+        total_prob.extend([prob])
+        self.dumpArrayFile(total_prediction,result_dir+'/'+name) 
+        self.dumpArrayFile(total_prob,result_dir+'/'+'prob.'+name) 
+        return total_prediction, total_prob
+    
+    def save_label(self, y_test, name, result_dir):
+        self.dumpArrayFile(y_test,result_dir+'/'+name) 
+    
+
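+# --- Usage sketch (illustrative; names and paths are assumptions) ---------
+# pred = predictors()
+# total_prediction, total_prob = pred.run_save(
+#     X_train, y_train, X_test, y_test, X_valid, y_valid,
+#     name='CVD', sampling='SMOTE', dimension=X_train.shape[1],
+#     result_dir='../../results')
+# run_save oversamples the training split, then trains the LinearSVC/DT/RF/LR
+# baselines plus the DNN and CNN, dumping predictions and probabilities.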
diff --git a/code/lib/Medline/__init__.py b/code/lib/Medline/__init__.py
new file mode 100644
index 0000000..572459c
--- /dev/null
+++ b/code/lib/Medline/__init__.py
@@ -0,0 +1,222 @@
+# Copyright 1999 by Jeffrey Chang.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to work with Medline from the NCBI.
+
+Classes:
+ - Record           A dictionary holding Medline data.
+
+Functions:
+ - read             Reads one Medline record
+ - parse            Allows you to iterate over a bunch of Medline records
+
+"""
+
+
+class Record(dict):
+    """A dictionary holding information from a Medline record.
+
+    All data are stored under the mnemonic appearing in the Medline
+    file. These mnemonics have the following interpretations:
+
+    ========= ==============================
+    Mnemonic  Description
+    --------- ------------------------------
+    AB        Abstract
+    CI        Copyright Information
+    AD        Affiliation
+    IRAD      Investigator Affiliation
+    AID       Article Identifier
+    AU        Author
+    FAU       Full Author
+    CN        Corporate Author
+    DCOM      Date Completed
+    DA        Date Created
+    LR        Date Last Revised
+    DEP       Date of Electronic Publication
+    DP        Date of Publication
+    EDAT      Entrez Date
+    GS        Gene Symbol
+    GN        General Note
+    GR        Grant Number
+    IR        Investigator Name
+    FIR       Full Investigator Name
+    IS        ISSN
+    IP        Issue
+    TA        Journal Title Abbreviation
+    JT        Journal Title
+    LA        Language
+    LID       Location Identifier
+    MID       Manuscript Identifier
+    MHDA      MeSH Date
+    MH        MeSH Terms
+    JID       NLM Unique ID
+    RF        Number of References
+    OAB       Other Abstract
+    OCI       Other Copyright Information
+    OID       Other ID
+    OT        Other Term
+    OTO       Other Term Owner
+    OWN       Owner
+    PG        Pagination
+    PS        Personal Name as Subject
+    FPS       Full Personal Name as Subject
+    PL        Place of Publication
+    PHST      Publication History Status
+    PST       Publication Status
+    PT        Publication Type
+    PUBM      Publishing Model
+    PMC       PubMed Central Identifier
+    PMID      PubMed Unique Identifier
+    RN        Registry Number/EC Number
+    NM        Substance Name
+    SI        Secondary Source ID
+    SO        Source
+    SFM       Space Flight Mission
+    STAT      Status
+    SB        Subset
+    TI        Title
+    TT        Transliterated Title
+    VI        Volume
+    CON       Comment on
+    CIN       Comment in
+    EIN       Erratum in
+    EFR       Erratum for
+    CRI       Corrected and Republished in
+    CRF       Corrected and Republished from
+    PRIN      Partial retraction in
+    PROF      Partial retraction of
+    RPI       Republished in
+    RPF       Republished from
+    RIN       Retraction in
+    ROF       Retraction of
+    UIN       Update in
+    UOF       Update of
+    SPIN      Summary for patients in
+    ORI       Original report in
+    ========= ==============================
+
+    """
+
+
+def parse(handle):
+    """Read Medline records one by one from the handle.
+
+    The handle is either is a Medline file, a file-like object, or a list
+    of lines describing one or more Medline records.
+
+    Typical usage::
+
+        >>> from Bio import Medline
+        >>> with open("Medline/pubmed_result2.txt") as handle:
+        ...     records = Medline.parse(handle)
+        ...     for record in records:
+        ...         print(record['TI'])
+        ...
+        A high level interface to SCOP and ASTRAL ...
+        GenomeDiagram: a python package for the visualization of ...
+        Open source clustering software.
+        PDB file parser and structure class implemented in Python.
+
+    """
+    # These keys point to string values
+    textkeys = (
+        "ID",
+        "PMID",
+        "SO",
+        "RF",
+        "NI",
+        "JC",
+        "TA",
+        "IS",
+        "CY",
+        "TT",
+        "CA",
+        "IP",
+        "VI",
+        "DP",
+        "YR",
+        "PG",
+        "LID",
+        "DA",
+        "LR",
+        "OWN",
+        "STAT",
+        "DCOM",
+        "PUBM",
+        "DEP",
+        "PL",
+        "JID",
+        "SB",
+        "PMC",
+        "EDAT",
+        "MHDA",
+        "PST",
+        "AB",
+        "EA",
+        "TI",
+        "JT",
+    )
+    handle = iter(handle)
+
+    key = ""
+    record = Record()
+    for line in handle:
+        line = line.rstrip()
+        if line[:6] == "      ":  # continuation line
+            if key in ["MH", "AD"]:
+                # Multi-line MESH term, want to append to last entry in list
+                record[key][-1] += line[5:]  # including space using line[5:]
+            else:
+                record[key].append(line[6:])
+        elif line:
+            key = line[:4].rstrip()
+            if key not in record:
+                record[key] = []
+            record[key].append(line[6:])
+        elif record:
+            # Join each list of strings into one string.
+            for key in record:
+                if key in textkeys:
+                    record[key] = " ".join(record[key])
+            yield record
+            record = Record()
+    if record:  # catch last one
+        for key in record:
+            if key in textkeys:
+                record[key] = " ".join(record[key])
+        yield record
+
+
+def read(handle):
+    """Read a single Medline record from the handle.
+
+    The handle is either is a Medline file, a file-like object, or a list
+    of lines describing a Medline record.
+
+    Typical usage:
+
+        >>> from Bio import Medline
+        >>> with open("Medline/pubmed_result1.txt") as handle:
+        ...     record = Medline.read(handle)
+        ...     print(record['TI'])
+        ...
+        The Bio* toolkits--a brief overview.
+
+    """
+    # Note: unlike Biopython's Medline.read, which returns the first record,
+    # this variant consumes the handle and returns the last record parsed
+    # (or [] if the handle holds no records).
+    item = []
+    records = parse(handle)
+    try:
+        while True:
+            item = next(records)
+    except StopIteration:
+        pass
+    finally:
+        del records
+    return item
diff --git a/code/lib/Medline/__pycache__/__init__.cpython-311.pyc b/code/lib/Medline/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..0dd7c25
Binary files /dev/null and b/code/lib/Medline/__pycache__/__init__.cpython-311.pyc differ
diff --git a/code/lib/Medline/__pycache__/__init__.cpython-312.pyc b/code/lib/Medline/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..3553acf
Binary files /dev/null and b/code/lib/Medline/__pycache__/__init__.cpython-312.pyc differ
diff --git a/code/lib/Medline/__pycache__/__init__.cpython-37.pyc b/code/lib/Medline/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..973d37e
Binary files /dev/null and b/code/lib/Medline/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Moon_gene2vec.py b/code/lib/Moon_gene2vec.py
new file mode 100644
index 0000000..edca73f
--- /dev/null
+++ b/code/lib/Moon_gene2vec.py
@@ -0,0 +1,369 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 29 18:30:00 2020
+
+@author: Jihye Moon
+
+"""
+import os
+import pathlib
+
+import numpy as np
+
+class Gene2vec():
+    def __init__(self, data=None, dic=None):
+        self.data = data
+        if dic is not None:
+            self.dic = dic
+        else:
+            self.dic = {}
+        self.count = {}
+        self.count['UNK'] = -1
+        self.dictionary = {}
+        
+    def data_processing(self, ):
+        # placeholder; not implemented
+        return 0
+        
+    def data_loading(self, dataload):
+        sent=[]; gene=[]
+        with open(dataload, 'r', encoding='UTF-8') as f:
+            buffer_data = f.readlines()
+            for lines in buffer_data:
+                stripped = lines.rstrip()
+                fields = stripped.split('\t')
+                if len(fields)>1:
+                    buffer = "".join(fields[1]).split()
+                    self.vocab(buffer)
+                    sent.append(buffer)
+                    if fields[0] != '-1':  # skip placeholder gene labels
+                        gene.append(fields[0])
+        return sent, gene
+    
+    def vocab(self, sent):
+        for word in sent:
+            if self.count.get(word, -1) != -1:
+                self.count[word] +=1
+            else:
+                self.count[word] = 1
+
+    def vocab_save(self, name, vocab, path=None):
+        if path is None:
+            vocab_dir = '/tmp/vocab'
+        else:
+            vocab_dir = path + '/vocab'
+        pathlib.Path(vocab_dir).mkdir(parents=True, exist_ok=True)
+        with open(os.path.join(vocab_dir, name+'.vocab.txt'), "w") as handle:
+            for k, v in vocab.items():
+                handle.write(k+'\t'+str(v)+'\n')
+    def vocab_output(self):
+        vocab_dir = '/tmp/vocab'
+        pathlib.Path(vocab_dir).mkdir(parents=True, exist_ok=True)
+        count = {k: v for k, v in sorted(self.count.items(), key=lambda item: item[1])}
+        with open(os.path.join(vocab_dir, 'vocab.txt'), "w") as handle:
+            for k, v in count.items():
+                handle.write(k+'\t'+str(v)+'\n')
+        return count
+    def vocab_import(self):
+        count = {k: v for k, v in sorted(self.count.items(), key=lambda item: item[1])}
+        x = {v: k for k, v in count.items()}  # inverted count -> word map
+        return count, x
+    def selecting_vocab(self, data, appearance, min_size=1):
+        if data is None:
+            dic = self.dictionary
+        else:
+            dic = data
+        new_dic = {}
+        dic_num = 0
+        for word in self.count:
+            # keep words seen more than `appearance` times and longer than min_size
+            if self.count[word] > appearance and len(word) > min_size:
+                new_dic[word] = dic_num
+                dic_num += 1
+        return new_dic
+    
+    def data_combine(self, sent):
+        sentences = []
+        for i in range(len(sent)):
+            sentences.append("".join(sent[i]).split())
+        return sentences
+            
+    def normal_dic(self, ):
+        # placeholder; not implemented
+        return 0
+    
+    def gene_dic(self, gene, dictionary):
+        dic = dictionary.copy()
+        gene_list = list(set(gene))
+        for gn in gene_list:
+            if dic.get(gn, -1) == -1:
+                dic[gn] = len(dic)  # assign the next free index
+
+        reversed_dic = dict(zip(dic.values(), dic.keys()))
+        self.dic = dic
+        self.gene_reverse_dict = reversed_dic
+        return dic, reversed_dic
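+    # Example: starting from {'the': 0, 'cell': 1}, adding genes ['tp53'] gives
+    # dic = {'the': 0, 'cell': 1, 'tp53': 2} and reversed_dic = {0: 'the', ...}.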
+    
+    def model_training(self, ):
+        # placeholder; not implemented
+        return 0
+    
+    def sent2idx(self, sent):
+        sents=[]
+        for j in range(len(sent)):
+            if self.dic.get(sent[j],-1) != -1:
+                sents.append(self.dic[sent[j]])
+        return sents
+
+    def idx2sent(self, sent):
+        sents=[]
+        for j in range(len(sent)):
+            if self.gene_reverse_dict.get(sent[j],-1) != -1:
+                sents.append(self.gene_reverse_dict[sent[j]])
+        return sents
+ 
+    def gene2associated_skip_gram(self, sents, gene, sents_count, window_size): 
+        saveD=[]
+        preprocessing=self.sent2idx(sents)
+        gene_index=gene
+        prelen=len(preprocessing)
+        x=[]    
+        for k in range(prelen):
+            if self.gene_reverse_dict.get(preprocessing[k],-1)!=-1:
+                x.extend([preprocessing[k]])
+        # symmetric context window of `window_size`, clamped at both ends
+        for i in range(len(x)):
+            index = i
+            if i < window_size:
+                value = x[0:index+window_size+1]
+            elif i >= len(x) - window_size:
+                value = x[index-window_size:]
+            else:
+                value = x[index-window_size:index+window_size+1]
+            saveD.append(value)
+        ix=0
+        data_index=0
+        sz = sum([len(saveD[j])-1 for j in range(len(saveD))])
+        batch = np.zeros(shape=(sz), dtype=np.int32)
+        labels = np.zeros(shape=(sz), dtype=np.int32)
+        span = 2 * window_size + 1  
+        size_counting=0
+        size=[]
+        total_size=[]
+        if data_index + span > len(x):
+            data_index = 0
+        for i in range(len(saveD)):
+            buffer = saveD[i] 
+            data_index += span
+            context_words = [w for w in buffer]
+            for j in range(len(context_words)-1):
+                batch[ix+j]=gene_index #x[i] 
+                labels[ix + j] = context_words[j]
+                size_counting+=1
+            for k in range(size_counting):
+                size.append(size_counting)
+            total_size.extend(size)
+            size=[]
+            size_counting=0
+            ix=ix+len(buffer)-1 
+        return batch, labels, total_size 
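+    # Illustrative sketch: with window_size=1 and indexed words x=[w0,w1,w2],
+    # the clamped windows are [w0,w1], [w0,w1,w2], [w1,w2]; each context word
+    # yields one (gene_index, word) pair, so `batch` repeats the gene index and
+    # `labels` lists the document words the gene embedding learns to predict.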
+    def gene2doc_batch_fucntion(self, sents, gene, sents_count, window_size): 
+        saveD=[]
+        preprocessing=self.sent2idx(sents)
+        gene_index=gene
+        prelen=len(preprocessing)
+        x=[]    
+        for k in range(prelen):
+            if self.gene_reverse_dict.get(preprocessing[k],-1)!=-1:
+                x.extend([preprocessing[k]])
+        # symmetric context window of `window_size`, clamped at both ends
+        for i in range(len(x)):
+            index = i
+            if i < window_size:
+                value = x[0:index+window_size+1]
+            elif i >= len(x) - window_size:
+                value = x[index-window_size:]
+            else:
+                value = x[index-window_size:index+window_size+1]
+            saveD.append(value)
+        ix=0
+        data_index=0
+        sz = sum([len(saveD[j])-1 for j in range(len(saveD))])
+        batch = np.zeros(shape=(sz), dtype=np.int32)
+        labels = np.zeros(shape=(sz), dtype=np.int32)
+        span = 2 * window_size + 1  
+        if data_index + span > len(x):
+            data_index = 0
+        for i in range(len(saveD)):
+            buffer = saveD[i] 
+            data_index += span
+            context_words = [w for w in buffer if w != x[i]] 
+            words_to_use = context_words
+            for j, context_word in enumerate(words_to_use):
+                batch[ix+j]=x[i] 
+                labels[ix + j] = context_word
+            ix=ix+len(buffer)-1 
+        return batch, labels 
+     
+    def gene_associated(self, sents, gene, sents_count, window_size): 
+        saveD=[]
+        preprocessing=self.sent2idx(sents)
+        gene_index=gene
+        prelen=len(preprocessing)
+        x=[]    
+        for k in range(prelen):
+            if self.gene_reverse_dict.get(preprocessing[k],-1)!=-1:
+                x.extend([preprocessing[k]])
+        # symmetric context window of `window_size`, clamped at both ends
+        for i in range(len(x)):
+            index = i
+            if i < window_size:
+                value = x[0:index+window_size+1]
+            elif i >= len(x) - window_size:
+                value = x[index-window_size:]
+            else:
+                value = x[index-window_size:index+window_size+1]
+            saveD.append(value)
+        ix=0
+        data_index=0
+        sz = sum([len(saveD[j])-1 for j in range(len(saveD))])
+        batch = np.zeros(shape=(sz), dtype=np.int32)
+        labels = np.zeros(shape=(sz), dtype=np.int32)
+        span = 2 * window_size + 1  
+        if data_index + span > len(x):
+            data_index = 0
+        for i in range(len(saveD)):
+            buffer = saveD[i] 
+            data_index += span
+            context_words = [w for w in buffer if w != x[i]] 
+            words_to_use = context_words
+            for j, context_word in enumerate(words_to_use):
+                batch[ix+j]=x[i] 
+                labels[ix + j] = context_word
+            ix=ix+len(buffer)-1 
+        return batch, labels 
+    def gene_associated_old2(self, sents, gene_dict2, gene_name, additing_quuery, sents_count, window_size):
+        saveD=[]
+        preprocessing=self.sent2idx(sents)
+        gene_index=gene=gene_dict2[gene_name]
+        associated = additing_quuery[gene_name].split()
+        associated_index = []
+        for i in range(len(associated)):
+            associated_index.append(gene_dict2[associated[i]])
+        
+        prelen=len(preprocessing)
+        x=[]    
+        for k in range(prelen):
+            if self.gene_reverse_dict.get(preprocessing[k],-1)!=-1:
+                x.extend([preprocessing[k]])
+        # symmetric context window of `window_size`, clamped at both ends
+        # (saveD is kept for parity with the other batch builders; the batch
+        # below is filled directly from `preprocessing`)
+        for i in range(len(x)):
+            index = i
+            if i < window_size:
+                value = x[0:index+window_size+1]
+            elif i >= len(x) - window_size:
+                value = x[index-window_size:]
+            else:
+                value = x[index-window_size:index+window_size+1]
+            saveD.append(value)
+        ix=0
+        data_index=0
+        batch = np.zeros(shape=(prelen, len(associated_index)), dtype=np.int32)
+        labels = np.zeros(shape=(prelen), dtype=np.int32)
+        span = 2 * window_size + 1  
+        if data_index + span > len(x):
+            data_index = 0
+        for j, context_word in enumerate(preprocessing):
+            batch[j] = associated_index
+            labels[j] = context_word
+        return batch, labels
+    def gene_insert(self, batch, label, gene):
+        buffer=0; saved_insert=[]; saved_value=[]
+        for i in range(len(batch)):
+            if i>0:        
+                if buffer!=batch[i]:
+                    saved_insert.append(i)
+                    saved_value.append(buffer)
+            buffer=batch[i]
+    
+        for i in range(len(saved_value)):
+            batch=np.insert(batch, saved_insert[i]+i, saved_value[i]) 
+        
+        batch=np.insert(batch, len(batch), buffer) 
+        label=np.insert(label, saved_insert, gene) 
+        label=np.insert(label, len(label), gene) 
+        return batch, label
+    def gene_additing(self, sents, gene, sents_count, window_size):
+        saveD=[]
+        preprocessing=self.sent2idx(sents)
+        gene_index=gene
+        prelen=len(preprocessing)
+        x=[]    
+        for k in range(prelen):
+            if self.gene_reverse_dict.get(preprocessing[k],-1)!=-1:
+                x.extend([preprocessing[k]])
+        # symmetric context window of `window_size`, clamped at both ends
+        # (saveD is kept for parity with the other batch builders; the batch
+        # below is filled directly from `preprocessing`)
+        for i in range(len(x)):
+            index = i
+            if i < window_size:
+                value = x[0:index+window_size+1]
+            elif i >= len(x) - window_size:
+                value = x[index-window_size:]
+            else:
+                value = x[index-window_size:index+window_size+1]
+            saveD.append(value)
+        ix=0
+        data_index=0
+        batch = np.zeros(shape=(prelen), dtype=np.int32)
+        labels = np.zeros(shape=(prelen), dtype=np.int32)
+        span = 2 * window_size + 1  
+        if data_index + span > len(x):
+            data_index = 0
+        for j, context_word in enumerate(preprocessing):
+            batch[j] = gene
+            labels[j] = context_word
+        return batch, labels
diff --git a/code/lib/__pycache__/Building_Literature_Embedding_Model.cpython-37.pyc b/code/lib/__pycache__/Building_Literature_Embedding_Model.cpython-37.pyc
new file mode 100644
index 0000000..dcc7573
Binary files /dev/null and b/code/lib/__pycache__/Building_Literature_Embedding_Model.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/CVD_risk_factor_search.cpython-37.pyc b/code/lib/__pycache__/CVD_risk_factor_search.cpython-37.pyc
new file mode 100644
index 0000000..12186a3
Binary files /dev/null and b/code/lib/__pycache__/CVD_risk_factor_search.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/Intrisic_Evaluation.cpython-37.pyc b/code/lib/__pycache__/Intrisic_Evaluation.cpython-37.pyc
new file mode 100644
index 0000000..5548fa1
Binary files /dev/null and b/code/lib/__pycache__/Intrisic_Evaluation.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/Literature_Data_Collection.cpython-311.pyc b/code/lib/__pycache__/Literature_Data_Collection.cpython-311.pyc
new file mode 100644
index 0000000..ebba28f
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Collection.cpython-311.pyc differ
diff --git a/code/lib/__pycache__/Literature_Data_Collection.cpython-312.pyc b/code/lib/__pycache__/Literature_Data_Collection.cpython-312.pyc
new file mode 100644
index 0000000..d005255
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Collection.cpython-312.pyc differ
diff --git a/code/lib/__pycache__/Literature_Data_Collection.cpython-37.pyc b/code/lib/__pycache__/Literature_Data_Collection.cpython-37.pyc
new file mode 100644
index 0000000..d578bae
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Collection.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc
new file mode 100644
index 0000000..c80249a
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc differ
diff --git a/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-37.pyc b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-37.pyc
new file mode 100644
index 0000000..e7ceaea
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
new file mode 100644
index 0000000..2d733ca
Binary files /dev/null and b/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc differ
diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-312.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-312.pyc
new file mode 100644
index 0000000..3b459af
Binary files /dev/null and b/code/lib/__pycache__/Loading_PudMed.cpython-312.pyc differ
diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-37.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-37.pyc
new file mode 100644
index 0000000..54ecf25
Binary files /dev/null and b/code/lib/__pycache__/Loading_PudMed.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/ML_models.cpython-37.pyc b/code/lib/__pycache__/ML_models.cpython-37.pyc
new file mode 100644
index 0000000..3b60654
Binary files /dev/null and b/code/lib/__pycache__/ML_models.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/Moon_gene2vec.cpython-37.pyc b/code/lib/__pycache__/Moon_gene2vec.cpython-37.pyc
new file mode 100644
index 0000000..3917397
Binary files /dev/null and b/code/lib/__pycache__/Moon_gene2vec.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/loading_literature_embedding.cpython-37.pyc b/code/lib/__pycache__/loading_literature_embedding.cpython-37.pyc
new file mode 100644
index 0000000..3dd8738
Binary files /dev/null and b/code/lib/__pycache__/loading_literature_embedding.cpython-37.pyc differ
diff --git a/code/lib/__pycache__/step4_CVD_risk_factor_search.cpython-37.pyc b/code/lib/__pycache__/step4_CVD_risk_factor_search.cpython-37.pyc
new file mode 100644
index 0000000..81e7b03
Binary files /dev/null and b/code/lib/__pycache__/step4_CVD_risk_factor_search.cpython-37.pyc differ
diff --git a/code/lib/loading_literature_embedding.py b/code/lib/loading_literature_embedding.py
new file mode 100644
index 0000000..e7c36b0
--- /dev/null
+++ b/code/lib/loading_literature_embedding.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr  1 16:48:48 2019
+
+@author: Jihye Moon
+""" 
+import numpy as np
+import os 
+import tensorflow as tf  
+ 
+class embedding_vector(): 
+    def text_open(self,path):
+        with open(path, 'r') as f:
+            data=f.read().strip().split('\n')
+        return data
+    
+    def data_split(self, key):
+        return key.split('#')
+    
+    def setting(self,path, gene_symb):
+        sess = tf.Session()
+        word2index = {} 
+        index2word=np.load(os.path.join(path, "name.dat"), allow_pickle=True)
+        saver = tf.train.import_meta_graph(os.path.join(path, "model.ckpt.meta"))
+        saver.restore(sess, (os.path.join(path, "model.ckpt")))
+        out_matrix=sess.run('nce_w:0') 
+        in_matrix=sess.run('embed1:0') 
+         
+        index2word=index2word.tolist()
+        
+        words_list = dict(zip(index2word.values(), index2word.keys()))
+        for i in range(len(index2word)):
+            word2index[index2word[i]] = i 
+            
+        self.index2word= dict(zip(word2index.values(), word2index.keys()))
+        self.word2index=word2index
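+        # L2-normalize each embedding row so dot products become cosine similarities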
+        self.syn0norm = np.array( [v/n for v, n in zip(in_matrix, np.linalg.norm(in_matrix, ord=2, axis=1))] )
+        self.syn1norm = np.array( [v/n for v, n in zip(out_matrix, np.linalg.norm(out_matrix, ord=2, axis=1))] )
+        
+        query_symbol=self.text_open(gene_symb+'.txt') #'../gene_name_info/query_symbol.txt'
+        
+        self.symble2name = {}
+        for i in range(len(query_symbol)):
+            self.symble2name[query_symbol[i]]=i
+
+        return words_list, index2word, self.syn0norm, self.syn1norm
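+
+    # Usage sketch (paths as in step4_CVD_risk_factor_identification.py):
+    #   ev = embedding_vector()
+    #   words_list, index2word, syn0norm, syn1norm = \
+    #       ev.setting('../data/old_model', '../data/gene_name_info/query_symbol')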
+    
+    def filtering(self, word, scores):
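+        # drop plural duplicates: if both a word and its 's'/'es' form survive
+        # the search (e.g. 'gene' and 'genes'), keep only the singular form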
+        word_x, word_y = word, word
+        unique_set=[]
+        non_single=[]
+        for x in word_x:
+            for y in word_y:
+                if x +'s' == y:
+                    non_single.append(y)
+                elif x +'es' == y:
+                    non_single.append(y) 
+        unique_set=list(set(non_single)) 
+        re_word=[]; re_score=[]
+        for i in range(len(word)):
+            if word[i] in unique_set:
+                continue
+            else:
+                re_word.append(word[i])
+                re_score.append(scores[i])
+        return re_word, re_score
+        
+    def compute_cosine_similarity(self,x,y):
+        return (np.dot(x,y)/(np.linalg.norm(x,2)*np.linalg.norm(y,2)))
+
+    def get_simwords(self, vec, matrix, TOPNUM):
+        sim_list = np.dot(matrix, vec.T)
+        word_sim_list = [ (s,w) for s, w in zip(sim_list, self.index2word)]
+        word_sim_list.sort(reverse=True)
+        return [ (v[1],v[0]) for v in word_sim_list[:TOPNUM]]
+
+    def get_simgenes(self, vec, matrix, TOPNUM):
+        symble2name=self.symble2name
+        sim_list = np.dot(matrix, vec.T)
+        word_sim_list = [ (s,w) for s, w in zip(sim_list, self.index2word)]
+        word_sim_list.sort(reverse=True)
+        count = 0
+        results = []
+        for v in word_sim_list:
+            # keep only hits whose token is a known gene symbol
+            if symble2name.get(self.index2word[v[1]].replace('#',''),-1) != -1:
+                results.append((v[1],v[0]))
+                count += 1
+                if count == TOPNUM:
+                    break
+        return results
+
+    def print_sim_result(self, result, query, output):
+        scores=[]; word=[]
+        for w, s in result:
+            word.append(self.index2word[w])
+            scores.append(s)
+
+        w, s = self.filtering(word, scores)
+        word=[]
+        for i in range(len(w)):
+            if w[i] not in query:
+                print("\t", w[i], s[i])
+                word.append(str(w[i])+' '+str(s[i]))
+        self.logs(output+' '.join(query), word)  # write the log once, after filtering
+        return None
+    
+    def type_similarity_display(self, output, TOPNUM):
+        kw = ''
+        while kw != '0':
+            kw = input("query word (exit: 0): ")
+            keywords = kw.split(" ")
+
+            index_keywords = [self.word2index.get(k, 0) for k in keywords]
+            buffer_index_keywords = index_keywords.copy()
+            index_keywords = []
+            print("==== Available Words (in-vocabulary):")
+            for ix in buffer_index_keywords:
+                if ix != 0:
+                    index_keywords.append(ix)
+                    print(self.index2word.get(ix, 0))
+            if index_keywords == []:
+                print("There are no available words. Try different queries!")
+            else:
+                vec_keyword = np.mean([self.syn0norm[ki] for ki in index_keywords], axis=0)
+
+                print("=== Intrinsic Evaluation: Words ")
+                result_inin = self.get_simwords(vec_keyword, self.syn0norm, TOPNUM)
+                _ = self.print_sim_result(result_inin, keywords, output)
+
+                print("========")
+                print("=== Intrinsic Evaluation: Gene Names")
+                result_inin = self.get_simgenes(vec_keyword, self.syn0norm, TOPNUM)
+                _ = self.print_sim_result(result_inin, keywords, output)
+
+    def similarity_display(self, kw, output, TOPNUM):
+        if kw!='0':  
+            keyword=kw 
+            keywords = keyword.split(" ") 
+            
+            index_keywords = [self.word2index.get(k,0) for k in keywords]
+ 
+            buffer_index_keywords=index_keywords.copy()
+            index_keywords=[]
+            print("==== Available Words (In-of-vocabulary):")
+            for ix in buffer_index_keywords:
+                if ix!=0:
+                    index_keywords.append(ix)
+                    print(self.index2word.get(ix,0))
+            if index_keywords ==[]:
+                print("There are no available words. Try different queries! ")
+            elif index_keywords !=[]:
+                vec_keyword = np.mean([self.syn0norm[ki] for ki in index_keywords], axis=0)
+                    
+                print ("=== Intrinsic Evaludation: Words ")
+                result_inin = self.get_simwords(vec_keyword, self.syn0norm, TOPNUM)
+                _ = self.print_sim_result(result_inin, keywords, output+'/word_')
+                    
+                print ("========")
+                print ("=== Intrinsic Evaludation: Gene Names")
+                result_inin = self.get_simgenes(vec_keyword, self.syn0norm, TOPNUM)
+                _ = self.print_sim_result(result_inin, keywords, output+'/gene_')
+ 
+    def variable2embed(self, words_list, syn0norm, variables_index, additional_dictionary):
+        variables_lists = list(variables_index.keys())
+        buffer_embedding = []
+        embedding=[]
+        removal = []
+        embedding_list = {}
+        index2variables = {}
+        removed_words=[]
+        for i in range(len(variables_lists)):
+            buffer_embedding=[]
+            words = variables_index[variables_lists[i]]
+            words = words.split()
+            for w in words:
+                if words_list.get(w, -2)!=-2:
+                    buffer_embedding.append(syn0norm[words_list[w]]) 
+                else:
+                    removed_words.append(w)
+                if additional_dictionary.get(w, -2)!=-2:
+                    buffer_embedding.append(syn0norm[words_list[additional_dictionary[w]]]) 
+            if buffer_embedding==[]:
+                removal.append(variables_lists[i])
+            else:
+                embedding.append(np.mean(buffer_embedding, axis=0))
+                embedding_list[variables_lists[i]] = i
+                index2variables[i] = variables_lists[i]
+        self.index2variables=index2variables
+        return embedding_list, index2variables, embedding, removal, removed_words
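+    # Usage sketch (hedged; variable names are hypothetical): map clinical
+    # variables to vectors by averaging the embeddings of their name words:
+    #   variables = {'AGE9': 'age', 'HDL9': 'high density lipoprotein'}
+    #   emb_list, idx2var, emb, removal, removed = \
+    #       model.variable2embed(words_list, syn0norm, variables, {})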
+     
+    def get_simvariables(self, vec, matrix, index2variables, TOPNUM):
+        sim_list = np.dot(matrix, vec.T)
+        word_sim_list = [ (s,w) for s, w in zip(sim_list, index2variables)]
+        word_sim_list.sort(reverse=True)
+        return [ (v[1],v[0]) for v in word_sim_list[:TOPNUM]]
+
+    def logs(self, path, word):
+        with open(path+'_logs.txt', 'w') as f:
+            for w in word:
+                f.write('{}\n'.format(w))
+        
+    def target2variable(self, words, key_embedding, wordlist, embedding, embedding_list, index2variables,variables_indexing, TOPNUM): # variables to variables
+        buffer = words.split(' ') 
+        if len(buffer)==1:
+            vec_keyword = key_embedding[wordlist[words]]
+        else:
+            vec_keyword = []
+            for i in range(len(buffer)):
+                if wordlist.get(buffer[i], -1)!=-1:
+                    vec_keyword.append(key_embedding[wordlist[buffer[i]]])
+            vec_keyword = np.array(vec_keyword)
+            vec_keyword = np.mean(vec_keyword,axis=0)
+        result_inin = self.get_simvariables(vec_keyword, embedding, index2variables, TOPNUM)  
+        data = ''; name = []
+        for w, s in result_inin: 
+            data=data+index2variables[w]+' '
+            name.append(index2variables[w]) 
+        return data, name  
+    
\ No newline at end of file
diff --git a/code/lib/performance_metrics.py b/code/lib/performance_metrics.py
new file mode 100644
index 0000000..0319e5c
--- /dev/null
+++ b/code/lib/performance_metrics.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Dec 10 14:17:51 2021
+
+@author: Jihye Moon
+"""
+import pandas as pd
+import numpy as np
+import os
+import scipy.stats as st
+
+from imblearn.metrics import sensitivity_score, specificity_score
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
+
+class performance_metrics(): 
+    def saving_5folded_results(self, data, label):
+        total=[]
+        for i in range(len(data)):
+            total.append(self.metric(data[i],label))
+        return pd.DataFrame(total)
+    
+    def metric(self, y_pred,y_true): 
+        A = accuracy_score(y_true,y_pred)
+        R2 = recall_score(y_true,y_pred, average='macro')
+        F2 = f1_score(y_true,y_pred, average='macro')
+        P2 = precision_score(y_true,y_pred, average='macro') 
+    
+        SS = sensitivity_score(y_true,y_pred)
+        SP = specificity_score(y_true,y_pred)
+    
+        return [A, P2, R2, F2, SS, SP] 
+    
+    def averaged_results(self, N, path, data_name):
+        data={}
+        for i in range(N):
+            label = np.load(os.path.join(path, 'CVD_label.dat'), allow_pickle=True)
+            data[i] = self.saving_5folded_results(np.load(os.path.join(path, data_name+'.dat'), allow_pickle=True),label)     
+        name=['accuracy', 'macro-precision', 'macro-recall', 'macro-f1', 'sensitivity', 'specificity']
+        for k in range(len(name)):
+            print("=============== ", name[k] , "=================== ")
+            print(self.ci2(data_name, data,k)) 
+    
+    def direct_averaged_results(self, N, data_name, labels, total_result):
+        data={}
+        for i in range(N):
+            label = labels[i]
+            data[i] = self.saving_5folded_results(total_result[i],label)     
+        name=['accuracy', 'macro-precision', 'macro-recall', 'macro-f1', 'sensitivity', 'specificity']
+        for k in range(len(name)):
+            print("=============== ", name[k] , "=================== ")
+            print(self.ci2(data_name, data,k))
+            
+    def ci2(self, name, _data, k): # 95% CI with averaged results
+        print('=== ', name, ' ===')
+        data = pd.concat([_data[0][k], _data[1][k], _data[2][k], _data[3][k], _data[4][k]])
+        all_size = max(data.index.tolist()) + 1
+        for az in range(all_size):
+            # 95% CI: mean +/- t(0.975, n-1) * standard error of the mean
+            mean = np.mean(data.loc[az])
+            low, high = st.t.interval(alpha=0.95, df=len(data.loc[az])-1,
+                                      loc=mean, scale=st.sem(data.loc[az]))
+            print(" ", str(round(mean, 2)) + ' (' + str(round(low, 2)) + ', ' + str(round(high, 2)) + ')')
+        return '=================== '
diff --git a/code/read_me_images/model1_re.jpg b/code/read_me_images/model1_re.jpg
new file mode 100644
index 0000000..15f5470
Binary files /dev/null and b/code/read_me_images/model1_re.jpg differ
diff --git a/code/read_me_images/model2_re.jpg b/code/read_me_images/model2_re.jpg
new file mode 100644
index 0000000..8fe1a1d
Binary files /dev/null and b/code/read_me_images/model2_re.jpg differ
diff --git a/code/read_me_images/model3.png b/code/read_me_images/model3.png
new file mode 100644
index 0000000..12c50b9
Binary files /dev/null and b/code/read_me_images/model3.png differ
diff --git a/code/read_me_images/model3_re.jpg b/code/read_me_images/model3_re.jpg
new file mode 100644
index 0000000..bd68752
Binary files /dev/null and b/code/read_me_images/model3_re.jpg differ
diff --git a/code/read_me_images/preprocessing.png b/code/read_me_images/preprocessing.png
new file mode 100644
index 0000000..72d0d5d
Binary files /dev/null and b/code/read_me_images/preprocessing.png differ
diff --git a/code/read_me_images/table_collection.png b/code/read_me_images/table_collection.png
new file mode 100644
index 0000000..4d79459
Binary files /dev/null and b/code/read_me_images/table_collection.png differ
diff --git a/code/read_me_images/table_data.png b/code/read_me_images/table_data.png
new file mode 100644
index 0000000..4c2c79c
Binary files /dev/null and b/code/read_me_images/table_data.png differ
diff --git a/code/read_me_images/table_pre.png b/code/read_me_images/table_pre.png
new file mode 100644
index 0000000..88cf2d2
Binary files /dev/null and b/code/read_me_images/table_pre.png differ
diff --git a/code/run b/code/run
new file mode 100644
index 0000000..8a05a49
--- /dev/null
+++ b/code/run
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
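+# Usage: bash run [demo_a|demo_b|demo_r]; with no argument, demo_a runs.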
+if [ "$1" == 'demo_a' ]; then
+  echo ' -- Running all demos'
+  echo ' -- Demo a -- '
+  EMBEDDING_PATH='../data/old_model'
+  STEP4_OUTPUT_PATH='../results/demo_a_CVD_searches' 
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+  
+elif [ "$1" == 'demo_b' ]; then
+  echo ' -- Demo b -- ' 
+  QUERY_WORD='heart'
+  NUM_WORD_BASED_DATA=500000
+  NUM_GENE_BASED_DATA=100
+  BASE_PATH='../results/'
+  DATA_COLLECTION_PATH='../results/demo_b'
+  PREPROCESSING_PATH='../results/demo_b'
+  EMBEDDING_PATH='../results/demo_b_model'
+  EPOCH=2 # number of epochs for the literature embedding model
+  STEP4_OUTPUT_PATH='../results/demo_b_CVD_searches'
+
+  python -u step1_data_collection.py $QUERY_WORD $NUM_WORD_BASED_DATA $NUM_GENE_BASED_DATA $DATA_COLLECTION_PATH
+  python -u step2_data_preprocessing.py $DATA_COLLECTION_PATH $PREPROCESSING_PATH
+  python -u step3_literature_embedding_training.py $PREPROCESSING_PATH $EPOCH $EMBEDDING_PATH
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+
+elif [ "$1" == 'demo_r' ]; then
+  echo ' -- Running reproduction demo' 
+  echo ' -- 1) CVD risk factor search using pre-trained model '
+  EMBEDDING_PATH='../data/old_model'
+  STEP4_OUTPUT_PATH='../results/demo_original_CVD_searches' 
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+  
+  echo '-- 2) Literature model training and evaluation using collected literature data '
+  PREPROCESSING_PATH='../data/old_preprocessed_data'
+  EPOCH=1 # number of epochs for the literature embedding model; the original paper used EPOCH=10
+  EMBEDDING_PATH='../results/demo_new_model'
+  STEP4_OUTPUT_PATH='../results/demo_new_CVD_searches'
+  python -u step3_literature_embedding_training.py $PREPROCESSING_PATH $EPOCH $EMBEDDING_PATH
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+
+else 
+  echo ' -- Default'
+  echo ' -- Demo a -- '
+  EMBEDDING_PATH='../data/old_model'
+  STEP4_OUTPUT_PATH='../results/demo_a_CVD_searches' 
+  python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+fi
+
diff --git a/code/step1_data_collection.py b/code/step1_data_collection.py
new file mode 100644
index 0000000..16c75e8
--- /dev/null
+++ b/code/step1_data_collection.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 21 00:16:25 2020
+
+Example usage:
+python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
+"""
+
+import os
+import pathlib 
+import sys
+
+sys.path.append('lib')  
+from lib.Literature_Data_Collection import literature_data_collection
+
+years = 15
+
+if len(sys.argv)>3:
+    word_query = str(sys.argv[1])
+    word_end_point = int(sys.argv[2]) # the endpoint of a word-based data collection. for demo-b 100000
+    gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection for demo-b 50
+    paths = str(sys.argv[4]) + '/'
+elif len(sys.argv)==3:
+    word_query = str(sys.argv[1])
+    word_end_point = 0; gene_end_point = 0  # 0 means: collect everything available
+    paths = str(sys.argv[2]) + '/'
+     
+data_dir = os.path.abspath(os.getcwd())
+output_dir = os.path.join(data_dir, paths + 'baseline_doc')
+document_output_dir = os.path.join(data_dir, paths + 'gene2document') 
+pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)
+email = "lrmercadod@gmail.com"  # Replace with your valid email address
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
+
+########### word query based literature data collection ################# 
+gap=10000
+batch = 1000
+w2d_starting_point = 0  
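+# `gap` sets how many abstracts each collection chunk covers; `batch` is the
+# per-request fetch size (assumption: both are consumed inside
+# literature_data_collection's download loop).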
+
+search_results, _word_end_point = ld.word_based_query_fit(year = years, user_term=word_query)
+print('The number of available abstracts:', _word_end_point, 'for', word_query)
+
+if word_end_point == 0:  # 0 means: collect everything available
+    word_end_point = _word_end_point
+ld.collecting_doc_using_word_based_query(year = years, user_term=word_query, gap = gap, starting = gap*w2d_starting_point, ixs = w2d_starting_point, test_end_point=word_end_point)
+
+########### gene name-query based literature data collection ################# 
+query_full=ld.text_open('./data/gene_name_info/query_full_name.txt')
+query_symbol=ld.text_open('./data/gene_name_info/query_symbol.txt') # gene name list
+
+query_size = len(query_full)
+ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up
+
+g2d_starting_point = 0
+batch_size = 100
+#############################
+if gene_end_point == 0:  # 0 means: collect over the full gene list
+    gene_end_point = round(query_size/batch_size)
+ld.collecting_doc_using_gene_based_query(year = years, batch_size = batch_size, starting = g2d_starting_point, query_len=len(query_full), end_point = gene_end_point)
diff --git a/code/step1_data_collection_Custom_Luis.py b/code/step1_data_collection_Custom_Luis.py
new file mode 100644
index 0000000..6f86893
--- /dev/null
+++ b/code/step1_data_collection_Custom_Luis.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 21 00:16:25 2020
+python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
+"""
+
+import os
+import pathlib
+import sys
+import time
+import urllib.error
+
+sys.path.append('lib')
+from lib.Literature_Data_Collection import literature_data_collection
+
+if len(sys.argv) > 3:
+    word_query = str(sys.argv[1])
+    word_end_point = int(sys.argv[2])  # the endpoint of a word-based data collection. for demo-b 100000
+    gene_end_point = int(sys.argv[3])  # the endpoint of gene name-based data collection for demo-b 50
+    paths = str(sys.argv[4]) + '/'
+elif len(sys.argv) == 3:
+    word_query = str(sys.argv[1])
+    word_end_point = 0; gene_end_point = 0  # 0 means: collect everything available
+    paths = str(sys.argv[2]) + '/'
+
+data_dir = os.path.abspath(os.getcwd())
+output_dir = os.path.join(data_dir, paths + 'baseline_doc')
+document_output_dir = os.path.join(data_dir, paths + 'gene2document')
+pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)
+
+email = "lrmercadod@gmail.com"  # Replace with your valid email address
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
+
+# setting up
+########### word query based literature data collection #################
+gap = 1000
+batch = 200
+w2d_starting_point = 0
+
+try:
+    search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
+    print('The number of available abstracts:', _word_end_point, 'for', word_query)
+    
+    if word_end_point == 0:  # 0 means: collect everything available
+        word_end_point = _word_end_point
+    
+    ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point,
+                                             ixs=w2d_starting_point, test_end_point=word_end_point)
+except urllib.error.HTTPError as e:
+    print(f"An HTTP error occurred: {e}")
+    print("Retrying in 5 seconds...")
+    time.sleep(5)
+    # Retry the request or handle the error appropriately
+
+########### gene name-query based literature data collection #################
+query_full = ld.text_open('./data/gene_name_info/query_full_name.txt')
+query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt')
+# gene name list
+query_size = len(query_full)
+ld.gene_based_query_fit(query_size, query_full, query_symbol)  # setting up
+
+g2d_starting_point = 0
+batch_size = 10
+
+############################
+if gene_end_point == 0:  # 0 means: collect over the full gene list
+    gene_end_point = round(query_size / batch_size)
+
+ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point,
+                                         query_len=len(query_full), end_point=gene_end_point)
\ No newline at end of file
diff --git a/code/step2_data_preprocessing.py b/code/step2_data_preprocessing.py
new file mode 100644
index 0000000..9538008
--- /dev/null
+++ b/code/step2_data_preprocessing.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 21 00:16:25 2020
+
+@author: Jihye Moon
+
+"""
+
+import os
+import pathlib
+import sys
+
+sys.path.append('lib')  
+import lib.Literature_Data_Preprocessing as ldp
+
+base = sys.argv[1]
+output = sys.argv[2]
+batch_dir = base # os.path.join(base, 'literature_data')
+comb_dir = os.path.join(base, 'arranged')
+preprocessed_dir = os.path.join(output, 'preprocessed')
+pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)
+ 
+lp=ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir) 
+
+### Extracting only abstracts and combining all collected files into one file (Gene name based documents)
+file_names, data_list=lp.batch_data_matching(batch_dir, ['gene2document'])
+arr_list = lp.combining_files(file_names, data_list, ['FullText'], 3)
+
+for i in range(len(file_names)):
+    lp.Indexing(os.path.join(comb_dir, file_names[i]), arr_list[file_names[i]])
+    
+gene2doc = lp.gene2doc_mapping(arr_list[file_names[0]])
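+# gene2doc maps each gene symbol to the abstracts collected for it in step 1
+# (the gene2document output); it is used below to build gene-based documents.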
+ 
+
+### Extracting only abstracts and combining all collected files into one file (Word name based documents)
+file_names_doc, data_list_doc = lp.batch_data_matching(batch_dir, ['baseline_doc'])
+arr_list2 = lp.combining_query2doc(file_names_doc, data_list_doc, ['pubmed'], 4) 
+
+
+### Literature Data Preprocessing
+total_size=len(arr_list2[file_names_doc[0]])
+full_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.FullText.txt'), "w")
+meta_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.meta.txt'), "w")
+
+total_FullText=[]
+for i in range(total_size):
+    FullText, Meta = lp.Medine_mapping(arr_list2[file_names_doc[0]][i]) 
+    #print(i, '/', total_size, round(i/total_size,2)*100)
+    total_FullText.append(FullText)
+    full_handle.write(FullText)
+    meta_handle.write(Meta)
+full_handle.close()
+meta_handle.close()
+
+doc_gene=list(gene2doc.keys())
+
+print('----- preprocessing --- for gene name based documents')
+lp.making_doc_data(doc_gene, file_names[0], gene2doc) 
+
+print('----- preprocessing --- for word name based documents')
+lp.making_doc_data(None, file_names_doc[0], total_FullText)
diff --git a/code/step3_literature_embedding_training.py b/code/step3_literature_embedding_training.py
new file mode 100644
index 0000000..56af086
--- /dev/null
+++ b/code/step3_literature_embedding_training.py
@@ -0,0 +1,57 @@
+"""
+Created on Sun Jun 21 00:16:25 2020
+
+@author: Jihye Moon
+
+"""
+ 
+import pathlib
+import sys
+sys.path.append('lib')  
+import Building_Literature_Embedding_Model as edg
+
+window_size = 2
+min_count = 5
+min_size = 2
+dimension = 128
+num_sampled = 16
+batch_size = 564 #256
+epoch = 10
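+# Skip-gram-style hyperparameters: `dimension` is the embedding size and
+# `num_sampled` the negative-sample count for the NCE loss; `min_count` and
+# `min_size` correspond to the vocabulary filters (minimum frequency / token length).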
+
+root_path = sys.argv[1]
+epoch = int(sys.argv[2])
+output = sys.argv[3]
+
+vocab_dir = output + '/vocab/'
+preprocessed_path = root_path + '/preprocessed'
+model_path = output 
+logs_dir = vocab_dir+'/logs'
+gene2doc_dir = logs_dir+'/gene2doc'
+baseline_doc_dir = logs_dir+'/baseline_doc'
+
+pathlib.Path(logs_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(gene2doc_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(baseline_doc_dir).mkdir(parents=True, exist_ok=True)
+pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)
+
+print("==== Generating Training Data for Literature Embedding Model")
+eg=edg.building_embedding_model()
+eg.setting(preprocessed_path, vocab_dir, logs_dir, gene2doc_dir, baseline_doc_dir)
+
+print("==== Creating Vocabulary ===")
+eg.creating_vocab()
+
+print("=== Checking If Data Generation Is Correct ===")
+eg.checking_gene2doc_generation(window_size)
+print("=== Creating Training Data For Fig. 3(a) and (b) in our paper ===")
+eg.creating_training_data_for_gene2doc(window_size)
+
+print("=== Creating Training Data For Fig. 2 in our paper ===")
+eg.creating_training_data_for_word2doc(window_size)
+
+print("=== Starting Model Training For Figs.2-3 ===")
+eg.model_setting(dimension=dimension, num_sampled=num_sampled)
+eg.starting_sorting(model_path)
+eg.model_training(epoch=epoch, batch_size=batch_size)
+
+
diff --git a/code/step4_CVD_risk_factor_identification.py b/code/step4_CVD_risk_factor_identification.py
new file mode 100644
index 0000000..e597a8c
--- /dev/null
+++ b/code/step4_CVD_risk_factor_identification.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Aug 25 16:46:07 2022
+
+@ Journal: Expert Systems With Applications
+@ Title: A Literature Embedding Model for Cardiovascular Disease Prediction using Risk Factors, Symptoms, and Genotype Information
+@ Accepted Date: Aug. 24, 2024
+@ Author: Jihye Moon, Hugo F. Posada-Quintero, and *Ki. H. Chon
+@ Contact Email: jihye.moon@uconn.edu 
+
+""" 
+import pathlib
+import sys
+
+sys.path.append('lib')
+import lib.CVD_risk_factor_search as ie
+
+model = ie.run_intrisic_evaluation()
+
+model_path = str(sys.argv[1])   # e.g., '../data/old_model'
+output_path = str(sys.argv[2])  # e.g., '../results/demo_a'
+
+queries = ['Zn Transport', 'protein names', 'drug interactions', 'cancer', 'drug names', 'protein drug', 'zinc', 'zn pathway']
+TOPNUM = 25
+pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
+model.setting(path=model_path, gene_symb='../data/gene_name_info/query_symbol')
+
+for query in queries:
+    model.running(query, output_path, TOPNUM)
+
diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py
new file mode 100644
index 0000000..0ab1f4e
--- /dev/null
+++ b/code/step_1_data_collection_Luis.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 21 00:16:25 2020
+Updated to include robust retry mechanism and API rate limiting
+"""
+
+import os
+import pathlib
+import sys
+import time
+import urllib.error
+
+# Ensuring the correct append path for 'lib'
+sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
+from lib.Loading_PudMed import ids_pudmed as pudmed
+
+class literature_data_collection:
+    def __init__(self, email, output_dir, document_output_dir, api_key=None):
+        self.output_dir = output_dir
+        self.document_output_dir = document_output_dir
+        self.email = email
+        self.api_key = api_key
+        print("Initialized literature_data_collection with email: {}".format(email))
+
+    def text_open(self, path):
+        with open(path, 'r') as f:
+            data = f.read().strip().split('\n')
+        return data
+
+    def word_based_query_fit(self, year=None, user_term="heart"):
+        pud = pudmed()
+        print("Created pudmed instance for searching.")
+        search_results, end_point = pud.search_list(user_term, year, self.email)
+        return search_results, end_point
+
+    def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
+        pud = pudmed()
+        print("Collecting documents using word-based query.")
+        search_results, end_point = pud.search_list(user_term, year, self.email)
+        if test_end_point != 0:
+            end_point = test_end_point
+        print('Checking data collection performance --- collecting until', end_point, 'documents')
+        next_start = starting
+        for ix in range(ixs, round(end_point/gap) + 1):
+            next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
+            if next_start >= end_point:
+                break
+
+    def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
+        success = False
+        attempts = 0
+        while not success and attempts < 5:
+            try:
+                print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
+                pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
+                success = True
+            except urllib.error.HTTPError as e:
+                attempts += 1
+                wait_time = 2 ** attempts
+                print(f"An HTTP error occurred: {e}")
+                print(f"Retrying in {wait_time} seconds...")
+                time.sleep(wait_time)
+
+        if not success:
+            print("Failed after 5 attempts, skipping this batch.")
+        return starting + gap  # Returns the next starting point
+
+if __name__ == "__main__":
+    if len(sys.argv) > 3:
+        word_query = str(sys.argv[1])
+        word_end_point = int(sys.argv[2])
+        gene_end_point = int(sys.argv[3])
+        paths = str(sys.argv[4]) + '/'
+    elif len(sys.argv) == 3:
+        word_query = str(sys.argv[1])
+        word_end_point = 0; gene_end_point = 0  # 0 means: collect everything available
+        paths = str(sys.argv[2]) + '/'
+
+    data_dir = os.path.abspath(os.getcwd())
+    output_dir = os.path.join(data_dir, paths + 'baseline_doc')
+    document_output_dir = os.path.join(data_dir, paths + 'gene2document')
+    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(document_output_dir, exist_ok=True)
+
+    email = "lrmercadod@gmail.com"  # Replace with your valid email address
+    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+    ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
+
+    gap = 50000  # Adjust as needed
+    batch = 10000  # Adjust as needed
+    w2d_starting_point = 0  # Adjust if resuming from a different point
+
+    try:
+        search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
+        print('The number of available abstracts:', _word_end_point, 'for', word_query)
+
+        if word_end_point == 0:  # 0 means: collect everything available
+            word_end_point = _word_end_point
+
+        ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
+    except urllib.error.HTTPError as e:
+        print(f"An HTTP error occurred: {e}")
+        print("Retrying in 5 seconds...")
+        time.sleep(5)
+
+    # Assuming gene data is prepared and ready to be processed
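+    # Note: gene_based_query_fit and collecting_doc_using_gene_based_query are
+    # defined on the full lib.Literature_Data_Collection class, not on the
+    # trimmed class above (which shadows the import), so this block only
+    # succeeds with the library class; failures land in the except below.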
+    try:
+        query_full = ld.text_open('data/gene_name_info/query_full_name.txt')  # Adjust path as necessary
+        query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt')  # Adjust path as necessary
+        query_size = len(query_full)
+        ld.gene_based_query_fit(query_size, query_full, query_symbol)
+
+        g2d_starting_point = 0
+        batch_size = 10
+        gene_end_point = round(query_size / batch_size)
+        if len(sys.argv) > 3 and int(sys.argv[3]) != 0:
+            gene_end_point = int(sys.argv[3])
+
+        ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
+    except Exception as e:
+        print(f"Error during gene-based data collection: {e}")
diff --git a/code/step_1_data_collection_Luis_.py b/code/step_1_data_collection_Luis_.py
new file mode 100644
index 0000000..4e313c0
--- /dev/null
+++ b/code/step_1_data_collection_Luis_.py
@@ -0,0 +1,21 @@
+from Bio import Entrez
+import time
+
+def download_data(query, batch_size=1000, delay=1):
+    Entrez.email = "your.email@example.com"  # NCBI requires a contact email
+    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
+    record = Entrez.read(handle)
+    handle.close()
+    ids = record["IdList"]
+    total = len(ids)
+    print(f"Total number of records: {total}")
+    for i in range(0, total, batch_size):
+        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
+        ids_batch = ids[i:i+batch_size]
+        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
+        data = handle.read()
+        # Do something with the data, e.g., save it to a file
+        with open("data.txt", "a", encoding='utf-8') as f:
+            f.write(data)
+        handle.close()
+        time.sleep(delay)
+        
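+# NCBI E-utilities allow roughly 3 requests/second without an API key
+# (per NCBI usage guidance), so keep `delay` >= 1 for large downloads.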
+download_data("zinc")
\ No newline at end of file
diff --git a/environment/Dockerfile b/environment/Dockerfile
new file mode 100644
index 0000000..89eaa02
--- /dev/null
+++ b/environment/Dockerfile
@@ -0,0 +1,7 @@
+# hash:sha256:e465d6106ff500ccbaa462a26e7c4b6ff7aade26df19638552849815bd95e8dc
+FROM registry.codeocean.com/codeocean/tensorflow:1.4.0-python3.5.2-cuda8.0.61-cudnn6.0.21-ubuntu16.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN pip3 install -U --no-cache-dir --upgrade-strategy=only-if-needed \
+    nltk==3.6.2
diff --git a/error_log.txt b/error_log.txt
new file mode 100644
index 0000000..d944cb9
--- /dev/null
+++ b/error_log.txt
@@ -0,0 +1,371 @@
+Error writing SNP ID 1401488244 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 1321762266 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 1242721601 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 1162358699 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 989444980 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 908631462 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 555494967 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 67190938 for gene ID 100134391: [Errno 22] Invalid argument
+Error writing SNP ID 1454934271 for gene ID 100133920: [Errno 22] Invalid argument
+Error writing SNP ID 1378719306 for gene ID 100133920: [Errno 22] Invalid argument
+Error writing SNP ID 1304374560 for gene ID 100133920: [Errno 22] Invalid argument
+Error writing SNP ID 1228792142 for gene ID 100133920: [Errno 22] Invalid argument
+Error writing SNP ID 3055566 for gene ID 100133920: [Errno 22] Invalid argument
+Error writing SNP ID 1377065813 for gene ID 100133331: [Errno 22] Invalid argument
+Error writing SNP ID 1203997153 for gene ID 100133331: [Errno 22] Invalid argument
+Error writing SNP ID 1475953306 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1422174919 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1367196304 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1312430718 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1254929479 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1198189379 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1035939999 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 943090697 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 761888488 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 537149284 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 6559249 for gene ID 100133077: [Errno 22] Invalid argument
+Error writing SNP ID 1459552283 for gene ID 100131472: [Errno 22] Invalid argument
+Error writing SNP ID 1215552070 for gene ID 100131472: [Errno 22] Invalid argument
+Error writing SNP ID 190355052 for gene ID 100131472: [Errno 22] Invalid argument
+Error writing SNP ID 1317791784 for gene ID 100131372: [Errno 22] Invalid argument
+Error writing SNP ID 1032440441 for gene ID 100131372: [Errno 22] Invalid argument
+Error writing SNP ID 774350025 for gene ID 100131372: [Errno 22] Invalid argument
+Error writing SNP ID 1403715705 for gene ID 100131289: [Errno 22] Invalid argument
+Error writing SNP ID 13192480 for gene ID 100131289: [Errno 22] Invalid argument
+Error writing SNP ID 1411086768 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 1331412871 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 1250795715 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 1166693659 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 985746876 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 897042239 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 549557861 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 6463645 for gene ID 100131257: [Errno 22] Invalid argument
+Error writing SNP ID 1162482167 for gene ID 100131096: [Errno 22] Invalid argument
+Error writing SNP ID 569206848 for gene ID 100131096: [Errno 22] Invalid argument
+Error writing SNP ID 1440931989 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 1331485874 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 1218107497 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 1015952163 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 905187583 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 187544859 for gene ID 100130964: [Errno 22] Invalid argument
+Error writing SNP ID 1403512839 for gene ID 100130876: [Errno 22] Invalid argument
+Error writing SNP ID 1160588349 for gene ID 100130876: [Errno 22] Invalid argument
+Error writing SNP ID 58269148 for gene ID 100130876: [Errno 22] Invalid argument
+Error writing SNP ID 1224558074 for gene ID 100130744: [Errno 22] Invalid argument
+Error writing SNP ID 761414982 for gene ID 100130744: [Errno 22] Invalid argument
+Error writing SNP ID 1434312909 for gene ID 100130698: [Errno 22] Invalid argument
+Error writing SNP ID 1266168830 for gene ID 100130698: [Errno 22] Invalid argument
+Error writing SNP ID 950291325 for gene ID 100130698: [Errno 22] Invalid argument
+Error writing SNP ID 1444085150 for gene ID 100130673: [Errno 22] Invalid argument
+Error writing SNP ID 535193915 for gene ID 100130673: [Errno 22] Invalid argument
+Error writing SNP ID 1395244138 for gene ID 100130587: [Errno 22] Invalid argument
+Error writing SNP ID 1276465865 for gene ID 100130587: [Errno 22] Invalid argument
+Error writing SNP ID 1054734281 for gene ID 100130587: [Errno 22] Invalid argument
+Error writing SNP ID 949279994 for gene ID 100130587: [Errno 22] Invalid argument
+Error writing SNP ID 567692067 for gene ID 100130587: [Errno 22] Invalid argument
+Error writing SNP ID 1452446245 for gene ID 100130502: [Errno 22] Invalid argument
+Error writing SNP ID 948845085 for gene ID 100130502: [Errno 22] Invalid argument
+Error writing SNP ID 1427296645 for gene ID 100130452: [Errno 22] Invalid argument
+Error writing SNP ID 1292396931 for gene ID 100130452: [Errno 22] Invalid argument
+Error writing SNP ID 1054257971 for gene ID 100130452: [Errno 22] Invalid argument
+Error writing SNP ID 924124821 for gene ID 100130452: [Errno 22] Invalid argument
+Error writing SNP ID 386654020 for gene ID 100130452: [Errno 22] Invalid argument
+Error writing SNP ID 1473991555 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1444752669 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1415506922 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1387168159 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1359171059 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1331689879 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1303506748 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1275744506 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1247356556 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1217490966 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1189421478 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1159376449 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1031959707 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1004368436 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 973227273 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 943672268 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 914646414 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 868593534 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 761427066 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 568918206 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 540705553 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 191753980 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 113244480 for gene ID 100130331: [Errno 22] Invalid argument
+Error writing SNP ID 1469723439 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 1383298658 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 1301222265 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 1219936237 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 1039151310 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 958735062 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 776464949 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 547097852 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 11862298 for gene ID 100130283: [Errno 22] Invalid argument
+Error writing SNP ID 1467965535 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1440863038 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1411124420 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1384161481 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1356135859 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1328668354 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1300706364 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1274407944 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1244911549 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1216192390 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1188007456 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1160028300 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1034054128 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 1007664231 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 979232284 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 949923982 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 920153574 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 890526906 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 756917908 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 559118170 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 529478619 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 180920765 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 6724403 for gene ID 100130256: [Errno 22] Invalid argument
+Error writing SNP ID 953022923 for gene ID 100130083: [Errno 22] Invalid argument
+Error writing SNP ID 1467821831 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 1367706192 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 1269846634 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 1170898671 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 979696164 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 867916940 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 547973127 for gene ID 100129931: [Errno 22] Invalid argument
+Error writing SNP ID 1468230327 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 1360086656 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 1250191056 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 1039629136 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 937306662 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 555779068 for gene ID 100129697: [Errno 22] Invalid argument
+Error writing SNP ID 1415899090 for gene ID 100129503: [Errno 22] Invalid argument
+Error writing SNP ID 943081523 for gene ID 100129503: [Errno 22] Invalid argument
+Error writing SNP ID 1295225460 for gene ID 100129476: [Errno 22] Invalid argument
+Error writing SNP ID 191836561 for gene ID 100129476: [Errno 22] Invalid argument
+Error writing SNP ID 943272702 for gene ID 100129473: [Errno 22] Invalid argument
+Error writing SNP ID 1405799168 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1246006261 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1005913565 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 756408635 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1451479264 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1219896618 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 902211470 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1455535940 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1236509623 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 911093945 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1397498615 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 903527749 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 1386379492 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1208191345 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 932246457 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 201207482 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1332246126 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 974348888 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 112799661 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 941742804 for gene ID 100128573: [Errno 22] Invalid argument
+Error writing SNP ID 1381978137 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1166093305 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 762463840 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 75048709 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1300239219 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 879683159 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 1467425305 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1414148569 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1357698689 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1302973516 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1245679701 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1193041929 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1019490115 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 899338912 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 535310734 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 4958990 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1377015885 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1266055812 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1053294258 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 782582963 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 11783919 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1215258193 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 532895802 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 1289133122 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 949052526 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 1451230755 for gene ID 100128002: [Errno 22] Invalid argument
+Error writing SNP ID 1041973346 for gene ID 100128002: [Errno 22] Invalid argument
+Error writing SNP ID 1472508296 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1425307614 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1374924715 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1328108783 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1274324822 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1225391909 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1174650019 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1030991467 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 985935809 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 939018273 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 891318435 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 570684633 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 368760172 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 9482609 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1354561271 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 1183159695 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 777074097 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 1168689953 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 937573888 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 368414153 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1340576577 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1013579192 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 552165477 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1348791799 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1025875099 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 529685715 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1178194412 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 1476717529 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1298193237 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1021561998 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 760988656 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1460770472 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1214815613 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 759553552 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1276633074 for gene ID 100128573: [Errno 22] Invalid argument
+Error writing SNP ID 1168689953 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 937573888 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 368414153 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1340576577 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1013579192 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 552165477 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1348791799 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1025875099 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 529685715 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1178194412 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 1476717529 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1298193237 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1021561998 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 760988656 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1460770472 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1214815613 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 759553552 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1276633074 for gene ID 100128573: [Errno 22] Invalid argument
+Error writing SNP ID 1168689953 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 937573888 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 368414153 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1340576577 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1013579192 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 552165477 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1348791799 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1025875099 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 529685715 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1178194412 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 1476717529 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1298193237 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1021561998 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 760988656 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1428970358 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1179412392 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 573057482 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1212664435 for gene ID 100128573: [Errno 22] Invalid argument
+Error writing SNP ID 1465316026 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1250211435 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 778620142 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 376861788 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1375702769 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 1170374501 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 1486847565 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1434780581 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1380171432 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1321965948 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1268442373 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1211745332 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1158450146 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 945227899 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 562613918 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 113812210 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1419983823 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1306603154 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1195941299 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 940958709 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 531284336 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1327600424 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 914601310 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 1388102325 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 1046670781 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 187067585 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 1271519673 for gene ID 100128002: [Errno 22] Invalid argument
+Error writing SNP ID 1490479661 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1442506271 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1393092454 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1346134640 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1292731845 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1243645858 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1192693850 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1046293712 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1004119480 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 956269867 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 909766099 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 757690502 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 539300770 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 114417332 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1414816038 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 1250136823 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 958271220 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 1168689953 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 937573888 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 368414153 for gene ID 100129316: [Errno 22] Invalid argument
+Error writing SNP ID 1340576577 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1013579192 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 552165477 for gene ID 100129215: [Errno 22] Invalid argument
+Error writing SNP ID 1348791799 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1025875099 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 529685715 for gene ID 100129098: [Errno 22] Invalid argument
+Error writing SNP ID 1178194412 for gene ID 100128818: [Errno 22] Invalid argument
+Error writing SNP ID 1476717529 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1298193237 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1021561998 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 760988656 for gene ID 100128770: [Errno 22] Invalid argument
+Error writing SNP ID 1428970358 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1179412392 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 573057482 for gene ID 100128593: [Errno 22] Invalid argument
+Error writing SNP ID 1212664435 for gene ID 100128573: [Errno 22] Invalid argument
+Error writing SNP ID 1465316026 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1250211435 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 778620142 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 376861788 for gene ID 100128494: [Errno 22] Invalid argument
+Error writing SNP ID 1375702769 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 1170374501 for gene ID 100128364: [Errno 22] Invalid argument
+Error writing SNP ID 1486847565 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1434780581 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1380171432 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1321965948 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1268442373 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1211745332 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1158450146 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 945227899 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 562613918 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 113812210 for gene ID 100128340: [Errno 22] Invalid argument
+Error writing SNP ID 1419983823 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1306603154 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1195941299 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 940958709 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 531284336 for gene ID 100128338: [Errno 22] Invalid argument
+Error writing SNP ID 1327600424 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 914601310 for gene ID 100128276: [Errno 22] Invalid argument
+Error writing SNP ID 1388102325 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 1046670781 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 187067585 for gene ID 100128242: [Errno 22] Invalid argument
+Error writing SNP ID 1022038 for gene ID 100128059: [Errno 22] Invalid argument
+Error writing SNP ID 1207012948 for gene ID 100128002: [Errno 22] Invalid argument
+Error writing SNP ID 1480734208 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1433052188 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1383813550 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1336828598 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1282866197 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1234863863 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1183687700 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1037657981 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 995080433 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 947092637 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 900183125 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 746594399 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 529397192 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 73771235 for gene ID 100126584: [Errno 22] Invalid argument
+Error writing SNP ID 1379383050 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 1211903818 for gene ID 100126447: [Errno 22] Invalid argument
+Error writing SNP ID 913589155 for gene ID 100126447: [Errno 22] Invalid argument
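The `[Errno 22] Invalid argument` entries above are the string form of Python's `OSError` with `errno` 22 (EINVAL), raised while writing individual SNP records. The uniform message format suggests the collection script wraps each write in a try/except and continues rather than aborting. A minimal sketch of that pattern follows; `write_snp`, the record layout, and the file handles are assumptions, since the actual collection code is not shown in this diff.

```python
def write_snp(out_handle, log_handle, snp_id, gene_id):
    """Write one SNP record; on failure, log in the error_log.txt format and continue.

    Hypothetical sketch: the real record layout and handles may differ.
    """
    try:
        out_handle.write("rs{}\t{}\n".format(snp_id, gene_id))
    except OSError as exc:  # str(exc) renders as "[Errno 22] Invalid argument"
        log_handle.write(
            "Error writing SNP ID {} for gene ID {}: {}\n".format(snp_id, gene_id, exc)
        )
```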
diff --git a/metadata/metadata.yml b/metadata/metadata.yml
new file mode 100644
index 0000000..3ffef0f
--- /dev/null
+++ b/metadata/metadata.yml
@@ -0,0 +1,31 @@
+metadata_version: 1
+name: Copy of A Literature Embedding Model for Cardiovascular Disease Prediction using
+  Risk Factors, Symptoms, and Genotype Information
+description: We have developed a literature embedding model to identify significant
+  cardiovascular disease (CVD) risk factors and associated information. Our model
+  is trained on literature data and retrieves CVD risk factors and related
+  information for a given query. It can also be used for CVD prediction on cohort
+  data through feature selection (FS) and dimensionality reduction (DR) tasks.
+  This capsule provides all procedures for literature data collection and
+  pre-processing, literature model training, CVD risk factor identification, and
+  FS and DR applications for CVD prediction on cohort data.
+tags:
+- Information Retrieval
+- Knowledge Representation
+- Machine Intelligence
+- cardiovascular-risk
+- Machine Learning
+- Natural Language Processing
+authors:
+- name: Jihye Moon
+  affiliations:
+  - name: University of Connecticut
+- name: Hugo F. Posada-Quintero
+  affiliations:
+  - name: University of Connecticut
+- name: Ki H. Chon
+  affiliations:
+  - name: University of Connecticut
+corresponding_contributor:
+  name: Ki H. Chon
+  email: ki.chon@uconn.edu
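The capsule metadata follows Code Ocean's `metadata_version: 1` layout and can be inspected programmatically. A minimal sketch, assuming PyYAML is available (it is not pinned in the capsule's environment):

```python
import yaml  # assumption: PyYAML installed separately

with open("metadata/metadata.yml") as fh:
    meta = yaml.safe_load(fh)

print(meta["name"])
print([author["name"] for author in meta["authors"]])
print(meta["corresponding_contributor"]["email"])
```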