step_1_data_collection_Luis.py
Enhance PubMed Data Fetching and Saving Mechanism

This commit introduces several enhancements to the PubMedDownloader class, improving its functionality and usability:

1. **Dynamic Year Querying**: Added support for querying PubMed year by year, allowing users to specify a range of years for which records are fetched.

2. **Structured Data Saving**: Implemented saving of fetched PubMed records in MEDLINE format. Each year's data is written to a separate text file, named after the query and the year, which simplifies data management and retrieval (a read-back sketch using `Bio.Medline` follows the diff).

3. **Error Handling**: Strengthened error handling to cope with network issues and API limitations, including a retry mechanism with exponential backoff and request timeouts to prevent hanging requests (a minimal sketch of this pattern follows this list).

4. **Directory Management**: Automated directory creation for storing the output files, ensuring that the user does not need to manually create directories before running the script.

These enhancements make the script more robust and user-friendly, suitable for handling large-scale data retrieval tasks in biomedical research environments.
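For reference, a minimal, self-contained sketch of the retry-with-exponential-backoff pattern described in point 3; the function name and URL handling are illustrative, not part of this commit:

```python
import time
import requests

def get_with_backoff(url, max_attempts=5, timeout=10):
    """Retry a GET request, doubling the wait after each failed attempt."""
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Treat HTTP errors as failures too
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_attempts:
                raise
            time.sleep(2 ** attempt)  # Back off: 2 s, 4 s, 8 s, ...
```

Doubling the delay between attempts keeps retries polite toward NCBI's rate limits while still recovering quickly from transient failures.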
lrm22005 committed May 6, 2024
1 parent afd4e22 commit 510e5e2
Showing 1 changed file with 43 additions and 27 deletions.
code/step_1_data_collection_Luis.py
@@ -3,6 +3,8 @@
 from io import StringIO
 from Bio import Medline
 from io import BytesIO
+import time
+import os
 
 class PubMedDownloader:
     def __init__(self, api_key, email):
@@ -11,36 +13,47 @@ def __init__(self, api_key, email):
         self.email = email
         Entrez.email = email  # Setting email for Biopython Entrez
 
-    def fetch_pubmed_data(self, query, batch_size=10000):
-        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
-        search_response = requests.get(search_url)
-        if search_response.status_code == 200:
-            try:
-                # Use BytesIO for binary data
-                search_results = Entrez.read(BytesIO(search_response.content))
-                webenv = search_results['WebEnv']
-                query_key = search_results['QueryKey']
-                count = int(search_results['Count'])
-                print(f"Total records found: {count}")
-            except Exception as e:
-                print("Error reading search results:", e)
-                return []
-        else:
-            print("Failed to retrieve search results")
-            return []
-
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
         records = []
-        for start in range(0, count, batch_size):
-            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
-            fetch_response = requests.get(fetch_url)
-            if fetch_response.status_code == 200:
-                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
-                print(f"Fetched {start + batch_size} of {count} records")
-            else:
-                print(f"Failed to fetch data for batch starting at {start}")
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                        fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                        fetch_response = requests.get(fetch_url, timeout=10)
+                        records.append(fetch_response.text)
+                        print(f"Fetched records starting from {start}")
+                    break
+                else:
+                    print(f"Failed to initiate search with status {search_response.status_code}")
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+        # Save records to a file
+        self.save_records_to_file(query, year, records)
+        return records
+
+    def save_records_to_file(self, query, year, records):
+        directory = f"./results/baseline_doc"
+        os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))  # Each record is separated by a newline
+        print(f"Saved records to {file_path}")

 class ids_pubmed():
     def __init__(self):
         self.snp_ids = []
@@ -104,4 +117,7 @@ def search_ids(self, search_email):

 downloader = PubMedDownloader(api_key, email)
 topic = "zinc"  # Define the topic of interest
-pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
+# Fetch and save records by year
+for year in range(1990, 2023):  # Example range of years
+    year_query = f"{topic} AND {year}[Date]"
+    downloader.fetch_pubmed_data(year_query, year)
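Following up on point 2 of the commit message: the script already imports `Bio.Medline` but does not yet use it. Here is a sketch of how the saved files could be read back, assuming the path convention from `save_records_to_file`; the query and year values are examples only:

```python
from Bio import Medline

# Path follows save_records_to_file: ./results/baseline_doc/{query}.{year}.txt
path = "./results/baseline_doc/zinc AND 1990[Date].1990.txt"
with open(path, encoding="utf-8") as handle:
    for record in Medline.parse(handle):  # Yields one dict per MEDLINE record
        print(record.get("PMID"), record.get("TI"))  # PubMed ID and title
```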
