Skip to content

Commit

Permalink
Gene downloading
Browse files Browse the repository at this point in the history
Downloader for Gene data; interacts with multiple NCBI sources without connection limits.
  • Loading branch information
lrm22005 committed May 14, 2024
1 parent f16f5d2 commit 1613e69
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 15 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
/gene_based_records
checkpoint.json
checkpoint.json
2 changes: 1 addition & 1 deletion checkpoint.json

Large diffs are not rendered by default.

35 changes: 21 additions & 14 deletions code/step1_data_collection_Luis_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def __init__(self, api_key, email, output_dir, max_records_per_query=9999, check
self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
self.api_key = api_key
self.email = email
self.max_records_per_query = max_records_per_query
Entrez.email = email # Set email for NCBI E-utilities
self.max_records_per_query = max_records_per_query
self.output_dir = output_dir
os.makedirs(self.output_dir, exist_ok=True)
self.checkpoint_file = checkpoint_file
Expand All @@ -21,28 +21,35 @@ def __init__(self, api_key, email, output_dir, max_records_per_query=9999, check
def fetch_pubmed_data(self, gene_name):
if gene_name in self.checkpoint_data:
logging.info(f"Skipping {gene_name} (already processed)")
return self.checkpoint_data[gene_name]

return self.checkpoint_data[gene_name]
normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
records = []
attempt = 0
max_attempts = 5

while attempt < max_attempts:
try:
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url, timeout=10)
if search_response.status_code == 200:
search_results = Entrez.read(BytesIO(search_response.content))
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
count = int(search_results['Count'])
logging.info(f"Total records found for {gene_name}: {count}")
if count > 0:
for start in range(0, count, self.max_records_per_query):
fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax=min(count - start, self.max_records_per_query)&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
fetch_response = requests.get(fetch_url, timeout=10)
records.append(fetch_response.text)
logging.info(f"Fetched records for {gene_name} starting from {start}")
if 'WebEnv' in search_results and 'QueryKey' in search_results:
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
count = int(search_results['Count'])
logging.info(f"Total records found for {gene_name}: {count}")
if count > 0:
for start in range(0, count, self.max_records_per_query):
fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
fetch_response = requests.get(fetch_url, timeout=10)
records.append(fetch_response.text)
logging.info(f"Fetched records for {gene_name} starting from {start}")
else:
logging.info(f"No records found for {gene_name}.")
return None
else:
logging.error(f"No WebEnv/QueryKey found in the search results for {gene_name}.")
return None
file_path = self.save_records_to_file(normalized_gene, records)
self.checkpoint_data[gene_name] = file_path
self.save_checkpoint()
Expand All @@ -52,7 +59,7 @@ def fetch_pubmed_data(self, gene_name):
attempt += 1
logging.error(f"Attempt {attempt}: An error occurred: {e}")
time.sleep(2 ** attempt)
return []
return None

def save_records_to_file(self, gene_name, records):
filename = f"{gene_name}.txt"
Expand Down

0 comments on commit 1613e69

Please sign in to comment.