Skip to content

Commit

Permalink
Genes and Query extraction
Browse files Browse the repository at this point in the history
Implementation of an exogenous gene extraction method.
  • Loading branch information
lrm22005 committed May 8, 2024
1 parent 860d0fe commit f16f5d2
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
/gene_based_records
1 change: 1 addition & 0 deletions checkpoint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"tumor protein p53": "./gene_based_records/tumor_protein_p53.txt", "epidermal growth factor receptor": "./gene_based_records/epidermal_growth_factor_receptor.txt", "apolipoprotein E": "./gene_based_records/apolipoprotein_E.txt", "tumor necrosis factor": "./gene_based_records/tumor_necrosis_factor.txt", "vascular endothelial growth factor A": "./gene_based_records/vascular_endothelial_growth_factor_A.txt", "interleukin 6": "./gene_based_records/interleukin_6.txt", "transforming growth factor beta 1": "./gene_based_records/transforming_growth_factor_beta_1.txt", "methylenetetrahydrofolate reductase": "./gene_based_records/methylenetetrahydrofolate_reductase.txt", "hypoxia inducible factor 1 subunit alpha": "./gene_based_records/hypoxia_inducible_factor_1_subunit_alpha.txt", "erb-b2 receptor tyrosine kinase 2": "./gene_based_records/erb-b2_receptor_tyrosine_kinase_2.txt", "estrogen receptor 1": "./gene_based_records/estrogen_receptor_1.txt", "interleukin 10": "./gene_based_records/interleukin_10.txt", "amyloid beta precursor protein": "./gene_based_records/amyloid_beta_precursor_protein.txt", "signal transducer and activator of transcription 3": "./gene_based_records/signal_transducer_and_activator_of_transcription_3.txt", "BRCA1 DNA repair associated": "./gene_based_records/BRCA1_DNA_repair_associated.txt", "angiotensin I converting enzyme": "./gene_based_records/angiotensin_I_converting_enzyme.txt", "KRAS proto-oncogene, GTPase": "./gene_based_records/KRAS_proto-oncogene,_GTPase.txt", "brain derived neurotrophic factor": "./gene_based_records/brain_derived_neurotrophic_factor.txt", "B-Raf proto-oncogene, serine/threonine kinase": "./gene_based_records/B-Raf_proto-oncogene,_serine_threonine_kinase.txt", "matrix metallopeptidase 9": "./gene_based_records/matrix_metallopeptidase_9.txt", "vitamin D receptor": "./gene_based_records/vitamin_D_receptor.txt", "C-reactive protein": "./gene_based_records/C-reactive_protein.txt", "CD274 molecule": 
"./gene_based_records/CD274_molecule.txt", "androgen receptor": "./gene_based_records/androgen_receptor.txt", "adiponectin, C1Q and collagen domain containing": "./gene_based_records/adiponectin,_C1Q_and_collagen_domain_containing.txt", "AKT serine/threonine kinase 1": "./gene_based_records/AKT_serine_threonine_kinase_1.txt", "ATP binding cassette subfamily B member 1": "./gene_based_records/ATP_binding_cassette_subfamily_B_member_1.txt", "nuclear factor kappa B subunit 1": "./gene_based_records/nuclear_factor_kappa_B_subunit_1.txt", "interleukin 1 beta": "./gene_based_records/interleukin_1_beta.txt", "major histocompatibility complex, class II, DR beta 1": "./gene_based_records/major_histocompatibility_complex,_class_II,_DR_beta_1.txt"}
3 changes: 2 additions & 1 deletion code/lib/Loading_PudMed.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from io import StringIO
import time
sys.path.append('lib')
from http.client import IncompleteRead

from Bio import Medline
import os
Expand Down Expand Up @@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch):
data = fetch_handle.read()
fetch_handle.close()
out_handle.write(data)
time.sleep(2) # Delay between each batch fetch to respect the API rate limit
time.sleep(5) # Delay between each batch fetch to respect the API rate limit
out_handle.close()

def fetch_rec(self, rec_id, entrez_handle):
Expand Down
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions code/step1_data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

years = 15
years = 35

if len(sys.argv)>3:
word_query = str(sys.argv[1])
Expand Down Expand Up @@ -52,7 +52,7 @@
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 1000
batch_size = 100
#############################
#####################
gene_end_point = round(query_size/batch_size)
Expand Down
94 changes: 94 additions & 0 deletions code/step1_data_collection_Luis_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json

class GenePubMedDownloader:
    """Download PubMed MEDLINE records for genes via NCBI E-utilities, with resume support.

    One text file per gene is written under ``output_dir``; a JSON checkpoint
    maps each completed gene name to its output file so reruns skip work
    that already finished.
    """

    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        """
        Args:
            api_key: NCBI API key (raises the E-utilities rate limit).
            email: Contact email required by NCBI E-utilities.
            output_dir: Directory for per-gene record files (created if absent).
            max_records_per_query: efetch page size (NCBI caps retmax at 9999).
            checkpoint_file: Path of the JSON resume checkpoint.
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        """Fetch all PubMed records for *gene_name* in MEDLINE text format.

        Returns the saved file path on success (or the cached path when the
        gene is already in the checkpoint), or an empty list after exhausting
        retries (kept for backward compatibility with existing callers).
        """
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]

        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        records = []
        attempt = 0
        max_attempts = 5
        while attempt < max_attempts:
            try:
                # params= lets requests URL-encode the term (spaces, commas,
                # slashes and the [Gene Name] brackets); the hand-built URL
                # previously sent them raw.
                search_response = requests.get(
                    f"{self.base_url}esearch.fcgi",
                    params={
                        "db": "pubmed",
                        "term": f"{gene_name}[Gene Name]",
                        "retmax": 1,
                        "api_key": self.api_key,
                        "usehistory": "y",
                    },
                    timeout=10,
                )
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    webenv = search_results['WebEnv']
                    query_key = search_results['QueryKey']
                    count = int(search_results['Count'])
                    logging.info(f"Total records found for {gene_name}: {count}")
                    if count > 0:
                        for start in range(0, count, self.max_records_per_query):
                            # BUG FIX: retmax used to be the literal string
                            # "min(count - start, self.max_records_per_query)"
                            # because the expression sat outside the f-string
                            # braces; evaluate it before building the request.
                            retmax = min(count - start, self.max_records_per_query)
                            fetch_response = requests.get(
                                f"{self.base_url}efetch.fcgi",
                                params={
                                    "db": "pubmed",
                                    "rettype": "medline",
                                    "retmode": "text",
                                    "retstart": start,
                                    "retmax": retmax,
                                    "webenv": webenv,
                                    "query_key": query_key,
                                    "api_key": self.api_key,
                                },
                                timeout=10,
                            )
                            records.append(fetch_response.text)
                            logging.info(f"Fetched records for {gene_name} starting from {start}")
                    # A zero-count gene still gets an (empty) file and a
                    # checkpoint entry, matching the original behavior.
                    file_path = self.save_records_to_file(normalized_gene, records)
                    self.checkpoint_data[gene_name] = file_path
                    self.save_checkpoint()
                    return file_path
                # BUG FIX: a non-200 response used to loop forever because
                # `attempt` was never incremented; count it as a failure.
                attempt += 1
                logging.error(f"Attempt {attempt}: HTTP {search_response.status_code} for {gene_name}")
                time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # exponential backoff between retries
        return []

    def save_records_to_file(self, gene_name, records):
        """Write the fetched record chunks for *gene_name* to one text file.

        Returns the path of the written file.
        """
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        """Return the checkpoint dict from disk, or {} when none exists yet."""
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        """Persist the in-memory checkpoint dict to ``checkpoint_file``."""
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)

def load_gene_names(file_path):
    """Read a newline-delimited gene list from *file_path*.

    Leading/trailing whitespace is stripped and blank lines are dropped.
    """
    names = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

if __name__ == "__main__":
    # Guard the script entry point so importing this module no longer
    # triggers network downloads as a side effect.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # SECURITY NOTE(review): this NCBI API key is committed to the repo
    # history — rotate it and load it from an environment variable or
    # config.ini (which is already gitignored) instead of hard-coding it.
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
    email = "lrmercadod@gmail.com"
    output_dir = "./gene_based_records/"
    downloader = GenePubMedDownloader(api_key, email, output_dir)

    # Load gene full names and symbols from the query info files.
    full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
    symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

    # Fetch records for each gene name and symbol; the checkpoint makes
    # this loop resumable across runs.
    for gene in full_names + symbols:
        downloader.fetch_pubmed_data(gene)
File renamed without changes.

0 comments on commit f16f5d2

Please sign in to comment.