diff --git a/.gitignore b/.gitignore index 8dc7703..7531fff 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ results/baseline_doc/pubmed.zinc.2.15.txt results/baseline_doc/zinc AND 2013\[Date\].2013.txt /results config.ini +/gene_based_records diff --git a/checkpoint.json b/checkpoint.json new file mode 100644 index 0000000..c4d6d57 --- /dev/null +++ b/checkpoint.json @@ -0,0 +1 @@ +{"tumor protein p53": "./gene_based_records/tumor_protein_p53.txt", "epidermal growth factor receptor": "./gene_based_records/epidermal_growth_factor_receptor.txt", "apolipoprotein E": "./gene_based_records/apolipoprotein_E.txt", "tumor necrosis factor": "./gene_based_records/tumor_necrosis_factor.txt", "vascular endothelial growth factor A": "./gene_based_records/vascular_endothelial_growth_factor_A.txt", "interleukin 6": "./gene_based_records/interleukin_6.txt", "transforming growth factor beta 1": "./gene_based_records/transforming_growth_factor_beta_1.txt", "methylenetetrahydrofolate reductase": "./gene_based_records/methylenetetrahydrofolate_reductase.txt", "hypoxia inducible factor 1 subunit alpha": "./gene_based_records/hypoxia_inducible_factor_1_subunit_alpha.txt", "erb-b2 receptor tyrosine kinase 2": "./gene_based_records/erb-b2_receptor_tyrosine_kinase_2.txt", "estrogen receptor 1": "./gene_based_records/estrogen_receptor_1.txt", "interleukin 10": "./gene_based_records/interleukin_10.txt", "amyloid beta precursor protein": "./gene_based_records/amyloid_beta_precursor_protein.txt", "signal transducer and activator of transcription 3": "./gene_based_records/signal_transducer_and_activator_of_transcription_3.txt", "BRCA1 DNA repair associated": "./gene_based_records/BRCA1_DNA_repair_associated.txt", "angiotensin I converting enzyme": "./gene_based_records/angiotensin_I_converting_enzyme.txt", "KRAS proto-oncogene, GTPase": "./gene_based_records/KRAS_proto-oncogene,_GTPase.txt", "brain derived neurotrophic factor": 
"./gene_based_records/brain_derived_neurotrophic_factor.txt", "B-Raf proto-oncogene, serine/threonine kinase": "./gene_based_records/B-Raf_proto-oncogene,_serine_threonine_kinase.txt", "matrix metallopeptidase 9": "./gene_based_records/matrix_metallopeptidase_9.txt", "vitamin D receptor": "./gene_based_records/vitamin_D_receptor.txt", "C-reactive protein": "./gene_based_records/C-reactive_protein.txt", "CD274 molecule": "./gene_based_records/CD274_molecule.txt", "androgen receptor": "./gene_based_records/androgen_receptor.txt", "adiponectin, C1Q and collagen domain containing": "./gene_based_records/adiponectin,_C1Q_and_collagen_domain_containing.txt", "AKT serine/threonine kinase 1": "./gene_based_records/AKT_serine_threonine_kinase_1.txt", "ATP binding cassette subfamily B member 1": "./gene_based_records/ATP_binding_cassette_subfamily_B_member_1.txt", "nuclear factor kappa B subunit 1": "./gene_based_records/nuclear_factor_kappa_B_subunit_1.txt", "interleukin 1 beta": "./gene_based_records/interleukin_1_beta.txt", "major histocompatibility complex, class II, DR beta 1": "./gene_based_records/major_histocompatibility_complex,_class_II,_DR_beta_1.txt"} \ No newline at end of file diff --git a/code/lib/Loading_PudMed.py b/code/lib/Loading_PudMed.py index 31104b5..2ea1bb4 100644 --- a/code/lib/Loading_PudMed.py +++ b/code/lib/Loading_PudMed.py @@ -12,6 +12,7 @@ from io import StringIO import time sys.path.append('lib') +from http.client import IncompleteRead from Bio import Medline import os @@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch): data = fetch_handle.read() fetch_handle.close() out_handle.write(data) - time.sleep(2) # Delay between each batch fetch to respect the API rate limit + time.sleep(5) # Delay between each batch fetch to respect the API rate limit out_handle.close() def fetch_rec(self, rec_id, entrez_handle): diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc 
"""Download PubMed MEDLINE records for a list of genes via NCBI E-utilities.

For every gene name the script runs an ESearch with ``usehistory=y``, pages
through the stored result set with EFetch, writes the MEDLINE text to one file
per gene and records the file path in a JSON checkpoint so that an interrupted
run can be resumed without re-downloading finished genes.
"""
import json
import logging
import os
import time
from io import BytesIO

import requests
from Bio import Entrez

# Characters that are unsafe or awkward in file names, each mapped to "_".
_FILENAME_TRANSLATION = str.maketrans(" /\\", "___")


class GenePubMedDownloader:
    """Fetch and persist PubMed records gene by gene, with checkpointing."""

    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        """Prepare the output directory and load any existing checkpoint.

        Args:
            api_key: NCBI API key, or None to use the anonymous rate limit.
            email: Contact email sent along with every E-utilities request.
            output_dir: Directory that receives one ``<gene>.txt`` per gene.
            max_records_per_query: Page size used for EFetch (must be >= 1).
            checkpoint_file: JSON file mapping gene name -> saved file path.

        Raises:
            ValueError: If ``max_records_per_query`` is smaller than 1.
        """
        if max_records_per_query < 1:
            # A zero/negative step would make the paging range() blow up later.
            raise ValueError("max_records_per_query must be at least 1")
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        """Download all PubMed records for ``gene_name`` and save them.

        Genes already present in the checkpoint are skipped. Network errors
        and non-2xx HTTP responses are retried up to five times with
        exponential backoff.

        Args:
            gene_name: Gene full name or symbol to search for.

        Returns:
            The path of the saved (or previously checkpointed) file, or an
            empty list when nothing was found or every attempt failed.
        """
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]

        normalized_gene = gene_name.translate(_FILENAME_TRANSLATION)
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                # A fresh list per attempt: a retry after a partial download
                # must not duplicate the batches fetched before the failure.
                records = self._download_records(gene_name)
            except requests.exceptions.RequestException as e:
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                if attempt < max_attempts:
                    time.sleep(2 ** attempt)
                continue

            if not records:
                return []
            file_path = self.save_records_to_file(normalized_gene, records)
            self.checkpoint_data[gene_name] = file_path
            self.save_checkpoint()
            return file_path
        return []

    def _common_params(self):
        """Return query parameters shared by every E-utilities request."""
        # requests omits keys whose value is None, so a missing key or email
        # simply is not sent.
        return {"db": "pubmed", "api_key": self.api_key, "email": self.email}

    def _download_records(self, gene_name):
        """Run ESearch + paged EFetch and return the MEDLINE text batches.

        Raises:
            requests.exceptions.RequestException: On connection problems,
                timeouts or non-2xx HTTP status codes.
        """
        search_params = self._common_params()
        search_params.update({
            "term": f"{gene_name}[Gene Name]",
            "retmax": 1,
            "usehistory": "y",
        })
        # Passing params= lets requests URL-encode spaces, commas, slashes and
        # brackets that occur in gene names.
        search_response = requests.get(f"{self.base_url}esearch.fcgi", params=search_params, timeout=10)
        search_response.raise_for_status()

        search_results = Entrez.read(BytesIO(search_response.content))
        webenv = search_results['WebEnv']
        query_key = search_results['QueryKey']
        count = int(search_results['Count'])
        logging.info(f"Total records found for {gene_name}: {count}")

        records = []
        for start in range(0, count, self.max_records_per_query):
            fetch_params = self._common_params()
            fetch_params.update({
                "rettype": "medline",
                "retmode": "text",
                "retstart": start,
                # Last page may be shorter than a full batch.
                "retmax": min(count - start, self.max_records_per_query),
                "webenv": webenv,
                "query_key": query_key,
            })
            fetch_response = requests.get(f"{self.base_url}efetch.fcgi", params=fetch_params, timeout=10)
            # Do not store an HTTP error page as if it were record data.
            fetch_response.raise_for_status()
            records.append(fetch_response.text)
            logging.info(f"Fetched records for {gene_name} starting from {start}")
        return records

    def save_records_to_file(self, gene_name, records):
        """Write the record batches to ``<output_dir>/<gene_name>.txt``.

        Args:
            gene_name: File-name-safe gene identifier (no extension).
            records: Iterable of text batches, joined with newlines.

        Returns:
            The path of the written file.
        """
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        """Return the checkpoint mapping, or an empty dict if none exists."""
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        """Persist the checkpoint mapping to ``self.checkpoint_file``."""
        # Write to a sibling temp file and swap it in, so an interrupted run
        # cannot leave a truncated checkpoint behind.
        tmp_path = f"{self.checkpoint_file}.tmp"
        with open(tmp_path, 'w') as file:
            json.dump(self.checkpoint_data, file)
        os.replace(tmp_path, self.checkpoint_file)


def load_gene_names(file_path):
    """Return the non-blank lines of ``file_path``, stripped of whitespace."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


def main():
    """Download records for every gene full name and symbol on disk."""
    # Setup logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # SECURITY: credentials are read from the environment instead of being
    # committed to the repository. Any key that was previously checked in
    # should be revoked and regenerated in the NCBI account settings.
    api_key = os.environ.get("NCBI_API_KEY")
    email = os.environ.get("NCBI_EMAIL")
    if not api_key:
        logging.warning("NCBI_API_KEY is not set; requests will use the anonymous rate limit")

    output_dir = "./gene_based_records/"
    downloader = GenePubMedDownloader(api_key, email, output_dir)

    # Load gene names and symbols
    full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
    symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

    # Fetch records for each gene name and symbol
    for gene in full_names + symbols:
        downloader.fetch_pubmed_data(gene)


if __name__ == "__main__":
    main()