step_1_data_collection_Luis.py
Enhance PubMed Data Fetching and Saving Mechanism

This commit introduces several enhancements to the PubMedDownloader class, improving its functionality and usability:

1. **Dynamic Year Querying**: Added support for querying PubMed year by year, allowing users to specify a range of years for which records are fetched.

2. **Structured Data Saving**: Implemented saving of fetched PubMed records in MEDLINE format. Each year's data is written to a separate text file, named after the query and the year, which simplifies data management and retrieval (a read-back sketch using `Bio.Medline` follows the diff).

3. **Error Handling**: Strengthened error handling to cope with network issues and API limitations, including a retry mechanism with exponential backoff and request timeouts to prevent hanging requests (a minimal sketch of this pattern follows this list).

4. **Directory Management**: Automated directory creation for storing the output files, ensuring that the user does not need to manually create directories before running the script.

These enhancements make the script more robust and user-friendly, suitable for handling large-scale data retrieval tasks in biomedical research environments.
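For reference, a minimal, self-contained sketch of the retry-with-exponential-backoff pattern described in point 3; the function name and URL handling are illustrative, not part of this commit:

```python
import time
import requests

def get_with_backoff(url, max_attempts=5, timeout=10):
    """Retry a GET request, doubling the wait after each failed attempt."""
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Treat HTTP errors as failures too
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_attempts:
                raise
            time.sleep(2 ** attempt)  # Back off: 2 s, 4 s, 8 s, ...
```

Doubling the delay between attempts keeps retries polite toward NCBI's rate limits while still recovering quickly from transient failures.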
lrm22005 committed May 6, 2024
1 parent afd4e22 commit 510e5e2
Showing 1 changed file with 43 additions and 27 deletions.
code/step_1_data_collection_Luis.py
@@ -3,6 +3,8 @@
 from io import StringIO
 from Bio import Medline
 from io import BytesIO
+import time
+import os
 
 class PubMedDownloader:
     def __init__(self, api_key, email):
@@ -11,36 +13,47 @@ def __init__(self, api_key, email):
         self.email = email
         Entrez.email = email  # Setting email for Biopython Entrez
 
-    def fetch_pubmed_data(self, query, batch_size=10000):
-        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
-        search_response = requests.get(search_url)
-        if search_response.status_code == 200:
-            try:
-                # Use BytesIO for binary data
-                search_results = Entrez.read(BytesIO(search_response.content))
-                webenv = search_results['WebEnv']
-                query_key = search_results['QueryKey']
-                count = int(search_results['Count'])
-                print(f"Total records found: {count}")
-            except Exception as e:
-                print("Error reading search results:", e)
-                return []
-        else:
-            print("Failed to retrieve search results")
-            return []
-
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
         records = []
-        for start in range(0, count, batch_size):
-            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
-            fetch_response = requests.get(fetch_url)
-            if fetch_response.status_code == 200:
-                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
-                print(f"Fetched {start + batch_size} of {count} records")
-            else:
-                print(f"Failed to fetch data for batch starting at {start}")
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                        fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                        fetch_response = requests.get(fetch_url, timeout=10)
+                        records.append(fetch_response.text)
+                        print(f"Fetched records starting from {start}")
+                    break
+                else:
+                    print(f"Failed to initiate search with status {search_response.status_code}")
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+        # Save records to a file
+        self.save_records_to_file(query, year, records)
+        return records
+
+    def save_records_to_file(self, query, year, records):
+        directory = f"./results/baseline_doc"
+        os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))  # Each record is separated by a newline
+        print(f"Saved records to {file_path}")

 class ids_pubmed():
     def __init__(self):
         self.snp_ids = []
@@ -104,4 +117,7 @@ def search_ids(self, search_email):

 downloader = PubMedDownloader(api_key, email)
 topic = "zinc"  # Define the topic of interest
-pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
+# Fetch and save records by year
+for year in range(1990, 2023):  # Example range of years
+    year_query = f"{topic} AND {year}[Date]"
+    downloader.fetch_pubmed_data(year_query, year)
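Following up on point 2 of the commit message: the script already imports `Bio.Medline` but does not yet use it. Here is a sketch of how the saved files could be read back, assuming the path convention from `save_records_to_file`; the query and year values are examples only:

```python
from Bio import Medline

# Path follows save_records_to_file: ./results/baseline_doc/{query}.{year}.txt
path = "./results/baseline_doc/zinc AND 1990[Date].1990.txt"
with open(path, encoding="utf-8") as handle:
    for record in Medline.parse(handle):  # Yields one dict per MEDLINE record
        print(record.get("PMID"), record.get("TI"))  # PubMed ID and title
```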
