From f16f5d27a20bf950c8efb023161000013776a62d Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Wed, 8 May 2024 08:09:14 -0400 Subject: [PATCH] Genes and Query extraction Implementation of an exogenous gene extraction method. --- .gitignore | 1 + checkpoint.json | 1 + code/lib/Loading_PudMed.py | 3 +- .../Loading_PudMed.cpython-311.pyc | Bin 11092 -> 11158 bytes code/step1_data_collection.py | 4 +- code/step1_data_collection_Luis_genes.py | 94 ++++++++++++++++++ ...y => step_1_data_collection_Luis_query.py} | 0 7 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 checkpoint.json create mode 100644 code/step1_data_collection_Luis_genes.py rename code/{step_1_data_collection_Luis_.py => step_1_data_collection_Luis_query.py} (100%) diff --git a/.gitignore b/.gitignore index 8dc7703..7531fff 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ results/baseline_doc/pubmed.zinc.2.15.txt results/baseline_doc/zinc AND 2013\[Date\].2013.txt /results config.ini +/gene_based_records diff --git a/checkpoint.json b/checkpoint.json new file mode 100644 index 0000000..c4d6d57 --- /dev/null +++ b/checkpoint.json @@ -0,0 +1 @@ +{"tumor protein p53": "./gene_based_records/tumor_protein_p53.txt", "epidermal growth factor receptor": "./gene_based_records/epidermal_growth_factor_receptor.txt", "apolipoprotein E": "./gene_based_records/apolipoprotein_E.txt", "tumor necrosis factor": "./gene_based_records/tumor_necrosis_factor.txt", "vascular endothelial growth factor A": "./gene_based_records/vascular_endothelial_growth_factor_A.txt", "interleukin 6": "./gene_based_records/interleukin_6.txt", "transforming growth factor beta 1": "./gene_based_records/transforming_growth_factor_beta_1.txt", "methylenetetrahydrofolate reductase": "./gene_based_records/methylenetetrahydrofolate_reductase.txt", "hypoxia inducible factor 1 subunit alpha": "./gene_based_records/hypoxia_inducible_factor_1_subunit_alpha.txt", "erb-b2 receptor tyrosine kinase 2": 
"./gene_based_records/erb-b2_receptor_tyrosine_kinase_2.txt", "estrogen receptor 1": "./gene_based_records/estrogen_receptor_1.txt", "interleukin 10": "./gene_based_records/interleukin_10.txt", "amyloid beta precursor protein": "./gene_based_records/amyloid_beta_precursor_protein.txt", "signal transducer and activator of transcription 3": "./gene_based_records/signal_transducer_and_activator_of_transcription_3.txt", "BRCA1 DNA repair associated": "./gene_based_records/BRCA1_DNA_repair_associated.txt", "angiotensin I converting enzyme": "./gene_based_records/angiotensin_I_converting_enzyme.txt", "KRAS proto-oncogene, GTPase": "./gene_based_records/KRAS_proto-oncogene,_GTPase.txt", "brain derived neurotrophic factor": "./gene_based_records/brain_derived_neurotrophic_factor.txt", "B-Raf proto-oncogene, serine/threonine kinase": "./gene_based_records/B-Raf_proto-oncogene,_serine_threonine_kinase.txt", "matrix metallopeptidase 9": "./gene_based_records/matrix_metallopeptidase_9.txt", "vitamin D receptor": "./gene_based_records/vitamin_D_receptor.txt", "C-reactive protein": "./gene_based_records/C-reactive_protein.txt", "CD274 molecule": "./gene_based_records/CD274_molecule.txt", "androgen receptor": "./gene_based_records/androgen_receptor.txt", "adiponectin, C1Q and collagen domain containing": "./gene_based_records/adiponectin,_C1Q_and_collagen_domain_containing.txt", "AKT serine/threonine kinase 1": "./gene_based_records/AKT_serine_threonine_kinase_1.txt", "ATP binding cassette subfamily B member 1": "./gene_based_records/ATP_binding_cassette_subfamily_B_member_1.txt", "nuclear factor kappa B subunit 1": "./gene_based_records/nuclear_factor_kappa_B_subunit_1.txt", "interleukin 1 beta": "./gene_based_records/interleukin_1_beta.txt", "major histocompatibility complex, class II, DR beta 1": "./gene_based_records/major_histocompatibility_complex,_class_II,_DR_beta_1.txt"} \ No newline at end of file diff --git a/code/lib/Loading_PudMed.py b/code/lib/Loading_PudMed.py 
index 31104b5..2ea1bb4 100644 --- a/code/lib/Loading_PudMed.py +++ b/code/lib/Loading_PudMed.py @@ -12,6 +12,7 @@ from io import StringIO import time sys.path.append('lib') +from http.client import IncompleteRead from Bio import Medline import os @@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch): data = fetch_handle.read() fetch_handle.close() out_handle.write(data) - time.sleep(2) # Delay between each batch fetch to respect the API rate limit + time.sleep(5) # Delay between each batch fetch to respect the API rate limit out_handle.close() def fetch_rec(self, rec_id, entrez_handle): diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc index 8f0ff861105a8459c5fdcf4f477c92fa227c55e1..5762f6d72098af2798e59043224cd2bbb654b163 100644 GIT binary patch delta 1422 zcmY+EO>7%Q6vubg-%gy^jvJ?G+B(EZydi`@OOa>@{ZP_I6bdf+kX2+Y-f_Iqez)Ai%^wVSE^?5~@X*Yj~JW%pJ3m|`$PwfZals=^~1X~Cj{>L(fT zOD1>BNZetHau=WOXtPSTGN3AF6eD>}F?#N(IEznrRC0!TLD71}2($b9mYkYvH!be- zGS`jHTkJK)dd1J`mn`4;FtiOd*C<=)(`n|-$NQxYNgbX1|NF%fi42TDbQI;0J zB|c+0aVt5&3Y~9~msB>`Ihs1H9vi0E65#;h5yYa#Wc;#jnTGCj7$bL)K;^hDf%facYL0Gm*dQ}xVsLEF27Ovd6QhNvP}^8Y0RoBB^>3$30diE$eCcesiW~nN)ZT-7<8_bi2a5BeGLP uEm-e1CG5WA&-k3}8ZC>@!wyc+KcuHHQJrdMzgH%{{r^XNH?r$-Vg2%^}}8 znLC4>mym)06+soPRD_UFTcK8{2p;@^5Ptwdyc7uuaSy0ZJR*4IiJ83$veJESZ)SGy zyEC)5R(`o++&7H0g5x*+NbB&9F~$DcS@>W`Z5wq%Re1c3J%a6-P;ZZj?O1Lpd5et?)fc8{p9R7+;Q-={$-*SAb!;o(@Fbb%2qYZ}grfLqB!9j{ z(`N`K;aP+^EX9Bn`y5$GA{<483BT*Q7WCm5nO-E63C9U1#8PHI3an+GXDfpnnF>RJ z4@XbNvm`Lom+f8!dQH9_I2|ucEMKSqbRg9H08Bl~%nEHT7weDLQaVD$eI4Jka=;h`T^4`O zJi%@cGWl(FYn6gx>sS$J;?#_RY!I8wNvc!OU+2JORIA%=J1(}N8Q&F^0N$1b^)M)k zjGyjwy}(R|sjknVfyH(sX|X=`QkalddlfVVYa5-hCI8@AE(eFEnu8l8b{);9-Am&5 z>=GugJ6mF3iO**rVeYmLe4?_7m@F*o^AxixT7?Bn=EsGjRq8A=8^${8+zZu?k0-T) zHCZ_bC>RLzL+BE&AVSS`Jb0QuHTu+fiwDiM$E6VaiqmuQUZ_q_X2@#T4!~O!n)Cw; z0>`@seer6shC;tCme>!2UyB~oa~RRo#LxM0(O9@H6A!Yxw&+)H6Sfgp&*+Mh;X2_R 
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json


class GenePubMedDownloader:
    """Download PubMed MEDLINE records for gene names via the NCBI E-utilities,
    resuming across runs through a JSON checkpoint file that maps each processed
    gene name to the path of its saved record file."""

    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        """
        Args:
            api_key: NCBI API key appended to every E-utilities request.
            email: Contact email; registered with Biopython's Entrez as required by NCBI.
            output_dir: Directory where per-gene MEDLINE text files are written (created if absent).
            max_records_per_query: Page size for efetch batching (NCBI caps retmax at 9999 for efetch).
            checkpoint_file: JSON file recording genes already fetched, for resume support.
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        """Fetch every MEDLINE record matching ``gene_name[Gene Name]`` and save them to one file.

        Uses esearch with ``usehistory=y`` to park the result set on the NCBI history
        server, then pages through it with efetch. Retries transient request errors
        with exponential backoff (up to 5 attempts).

        Returns:
            The output file path on success (or on a checkpoint hit); an empty list
            when nothing could be fetched (kept as ``[]`` for backward compatibility
            with existing callers).
        """
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]

        # Sanitize the gene name into a filesystem-safe file stem.
        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        records = []
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                search_url = (
                    f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]"
                    f"&retmax=1&api_key={self.api_key}&usehistory=y"
                )
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code != 200:
                    # Non-OK server answer: give up without retrying, as the original did.
                    break
                search_results = Entrez.read(BytesIO(search_response.content))
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
                count = int(search_results['Count'])
                logging.info(f"Total records found for {gene_name}: {count}")
                if count == 0:
                    break
                for start in range(0, count, self.max_records_per_query):
                    # BUG FIX: the original embedded the text
                    # "min(count - start, self.max_records_per_query)" literally in the
                    # URL (no f-string braces), so NCBI never received a numeric retmax.
                    retmax = min(count - start, self.max_records_per_query)
                    fetch_url = (
                        f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text"
                        f"&retstart={start}&retmax={retmax}"
                        f"&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                    )
                    fetch_response = requests.get(fetch_url, timeout=10)
                    records.append(fetch_response.text)
                    logging.info(f"Fetched records for {gene_name} starting from {start}")
                file_path = self.save_records_to_file(normalized_gene, records)
                self.checkpoint_data[gene_name] = file_path
                self.save_checkpoint()
                return file_path
            except requests.exceptions.RequestException as e:
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # exponential backoff before retrying
        return []

    def save_records_to_file(self, gene_name, records):
        """Write the fetched MEDLINE batches to ``<output_dir>/<gene_name>.txt`` and return the path."""
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        """Return the checkpoint dict from disk, or an empty dict when no checkpoint exists yet."""
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        """Persist the in-memory checkpoint dict so an interrupted run can resume."""
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)


def load_gene_names(file_path):
    """Return the non-blank, stripped lines of *file_path* as a list of gene names."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network downloads.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # SECURITY: credentials are hardcoded here; move them to config/environment
    # variables and rotate this API key, since it has been committed to history.
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
    email = "lrmercadod@gmail.com"
    output_dir = "./gene_based_records/"
    downloader = GenePubMedDownloader(api_key, email, output_dir)

    # Load gene names and symbols
    full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
    symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

    # Fetch records for each gene name and symbol
    for gene in full_names + symbols:
        downloader.fetch_pubmed_data(gene)