Skip to content

Commit

Permalink
Genes and Query extraction
Browse files Browse the repository at this point in the history
Implementation of an exogenous gene extraction method.
  • Loading branch information
lrm22005 committed May 8, 2024
1 parent 860d0fe commit f16f5d2
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
/gene_based_records
1 change: 1 addition & 0 deletions checkpoint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"tumor protein p53": "./gene_based_records/tumor_protein_p53.txt", "epidermal growth factor receptor": "./gene_based_records/epidermal_growth_factor_receptor.txt", "apolipoprotein E": "./gene_based_records/apolipoprotein_E.txt", "tumor necrosis factor": "./gene_based_records/tumor_necrosis_factor.txt", "vascular endothelial growth factor A": "./gene_based_records/vascular_endothelial_growth_factor_A.txt", "interleukin 6": "./gene_based_records/interleukin_6.txt", "transforming growth factor beta 1": "./gene_based_records/transforming_growth_factor_beta_1.txt", "methylenetetrahydrofolate reductase": "./gene_based_records/methylenetetrahydrofolate_reductase.txt", "hypoxia inducible factor 1 subunit alpha": "./gene_based_records/hypoxia_inducible_factor_1_subunit_alpha.txt", "erb-b2 receptor tyrosine kinase 2": "./gene_based_records/erb-b2_receptor_tyrosine_kinase_2.txt", "estrogen receptor 1": "./gene_based_records/estrogen_receptor_1.txt", "interleukin 10": "./gene_based_records/interleukin_10.txt", "amyloid beta precursor protein": "./gene_based_records/amyloid_beta_precursor_protein.txt", "signal transducer and activator of transcription 3": "./gene_based_records/signal_transducer_and_activator_of_transcription_3.txt", "BRCA1 DNA repair associated": "./gene_based_records/BRCA1_DNA_repair_associated.txt", "angiotensin I converting enzyme": "./gene_based_records/angiotensin_I_converting_enzyme.txt", "KRAS proto-oncogene, GTPase": "./gene_based_records/KRAS_proto-oncogene,_GTPase.txt", "brain derived neurotrophic factor": "./gene_based_records/brain_derived_neurotrophic_factor.txt", "B-Raf proto-oncogene, serine/threonine kinase": "./gene_based_records/B-Raf_proto-oncogene,_serine_threonine_kinase.txt", "matrix metallopeptidase 9": "./gene_based_records/matrix_metallopeptidase_9.txt", "vitamin D receptor": "./gene_based_records/vitamin_D_receptor.txt", "C-reactive protein": "./gene_based_records/C-reactive_protein.txt", "CD274 molecule": 
"./gene_based_records/CD274_molecule.txt", "androgen receptor": "./gene_based_records/androgen_receptor.txt", "adiponectin, C1Q and collagen domain containing": "./gene_based_records/adiponectin,_C1Q_and_collagen_domain_containing.txt", "AKT serine/threonine kinase 1": "./gene_based_records/AKT_serine_threonine_kinase_1.txt", "ATP binding cassette subfamily B member 1": "./gene_based_records/ATP_binding_cassette_subfamily_B_member_1.txt", "nuclear factor kappa B subunit 1": "./gene_based_records/nuclear_factor_kappa_B_subunit_1.txt", "interleukin 1 beta": "./gene_based_records/interleukin_1_beta.txt", "major histocompatibility complex, class II, DR beta 1": "./gene_based_records/major_histocompatibility_complex,_class_II,_DR_beta_1.txt"}
3 changes: 2 additions & 1 deletion code/lib/Loading_PudMed.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from io import StringIO
import time
sys.path.append('lib')
from http.client import IncompleteRead

from Bio import Medline
import os
Expand Down Expand Up @@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch):
data = fetch_handle.read()
fetch_handle.close()
out_handle.write(data)
time.sleep(2) # Delay between each batch fetch to respect the API rate limit
time.sleep(5) # Delay between each batch fetch to respect the API rate limit
out_handle.close()

def fetch_rec(self, rec_id, entrez_handle):
Expand Down
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions code/step1_data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

years = 15
years = 35

if len(sys.argv)>3:
word_query = str(sys.argv[1])
Expand Down Expand Up @@ -52,7 +52,7 @@
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 1000
batch_size = 100
#############################
#####################
gene_end_point = round(query_size/batch_size)
Expand Down
94 changes: 94 additions & 0 deletions code/step1_data_collection_Luis_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json

class GenePubMedDownloader:
    """Download PubMed MEDLINE records for genes via NCBI E-utilities, with resume support.

    One text file per gene is written under ``output_dir``; a JSON checkpoint
    maps each completed gene name to its output file so reruns skip work
    that already finished.
    """

    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        """
        Args:
            api_key: NCBI API key (raises the E-utilities rate limit).
            email: Contact email required by NCBI E-utilities.
            output_dir: Directory for per-gene record files (created if absent).
            max_records_per_query: efetch page size (NCBI caps retmax at 9999).
            checkpoint_file: Path of the JSON resume checkpoint.
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        """Fetch all PubMed records for *gene_name* in MEDLINE text format.

        Returns the saved file path on success (or the cached path when the
        gene is already in the checkpoint), or an empty list after exhausting
        retries (kept for backward compatibility with existing callers).
        """
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]

        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        records = []
        attempt = 0
        max_attempts = 5
        while attempt < max_attempts:
            try:
                # params= lets requests URL-encode the term (spaces, commas,
                # slashes and the [Gene Name] brackets); the hand-built URL
                # previously sent them raw.
                search_response = requests.get(
                    f"{self.base_url}esearch.fcgi",
                    params={
                        "db": "pubmed",
                        "term": f"{gene_name}[Gene Name]",
                        "retmax": 1,
                        "api_key": self.api_key,
                        "usehistory": "y",
                    },
                    timeout=10,
                )
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    webenv = search_results['WebEnv']
                    query_key = search_results['QueryKey']
                    count = int(search_results['Count'])
                    logging.info(f"Total records found for {gene_name}: {count}")
                    if count > 0:
                        for start in range(0, count, self.max_records_per_query):
                            # BUG FIX: retmax used to be the literal string
                            # "min(count - start, self.max_records_per_query)"
                            # because the expression sat outside the f-string
                            # braces; evaluate it before building the request.
                            retmax = min(count - start, self.max_records_per_query)
                            fetch_response = requests.get(
                                f"{self.base_url}efetch.fcgi",
                                params={
                                    "db": "pubmed",
                                    "rettype": "medline",
                                    "retmode": "text",
                                    "retstart": start,
                                    "retmax": retmax,
                                    "webenv": webenv,
                                    "query_key": query_key,
                                    "api_key": self.api_key,
                                },
                                timeout=10,
                            )
                            records.append(fetch_response.text)
                            logging.info(f"Fetched records for {gene_name} starting from {start}")
                    # A zero-count gene still gets an (empty) file and a
                    # checkpoint entry, matching the original behavior.
                    file_path = self.save_records_to_file(normalized_gene, records)
                    self.checkpoint_data[gene_name] = file_path
                    self.save_checkpoint()
                    return file_path
                # BUG FIX: a non-200 response used to loop forever because
                # `attempt` was never incremented; count it as a failure.
                attempt += 1
                logging.error(f"Attempt {attempt}: HTTP {search_response.status_code} for {gene_name}")
                time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # exponential backoff between retries
        return []

    def save_records_to_file(self, gene_name, records):
        """Write the fetched record chunks for *gene_name* to one text file.

        Returns the path of the written file.
        """
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        """Return the checkpoint dict from disk, or {} when none exists yet."""
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        """Persist the in-memory checkpoint dict to ``checkpoint_file``."""
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)

def load_gene_names(file_path):
    """Read a newline-delimited gene list from *file_path*.

    Leading/trailing whitespace is stripped and blank lines are dropped.
    """
    names = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

if __name__ == "__main__":
    # Guard the script entry point so importing this module no longer
    # triggers network downloads as a side effect.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # SECURITY NOTE(review): this NCBI API key is committed to the repo
    # history — rotate it and load it from an environment variable or
    # config.ini (which is already gitignored) instead of hard-coding it.
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
    email = "lrmercadod@gmail.com"
    output_dir = "./gene_based_records/"
    downloader = GenePubMedDownloader(api_key, email, output_dir)

    # Load gene full names and symbols from the query info files.
    full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
    symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

    # Fetch records for each gene name and symbol; the checkpoint makes
    # this loop resumable across runs.
    for gene in full_names + symbols:
        downloader.fetch_pubmed_data(gene)
File renamed without changes.

0 comments on commit f16f5d2

Please sign in to comment.