From 510e5e21cc3ffedc3633af3f22e9bcd94604d04a Mon Sep 17 00:00:00 2001
From: Luis Roberto Mercado Diaz
Date: Mon, 6 May 2024 12:27:48 -0400
Subject: [PATCH] step_1_data_collection_Luis.py: Enhance PubMed data fetching
 and saving mechanism

This commit introduces several enhancements to the PubMedDownloader class, improving its functionality and usability:

1. **Dynamic Year Querying**: Added support for querying by year, so users can specify a range of years for which PubMed records should be fetched.

2. **Structured Data Saving**: Fetched PubMed records are now saved in MEDLINE format, with each year's data written to a separate text file named after the query and the year. This makes the data easier to manage and retrieve.

3. **Error Handling**: Network issues and API limitations are handled more robustly, using a retry mechanism with exponential backoff (see the sketch after this list) and request timeouts to prevent hanging requests.

4. **Directory Management**: The output directory is created automatically, so the user no longer needs to create it manually before running the script.

These enhancements make the script more robust and user-friendly, and suitable for large-scale data retrieval in biomedical research environments.
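For reference, the retry-with-backoff pattern from item 3 has the following general shape. This is a minimal sketch, not the exact code in the diff below; the helper name get_with_retries is illustrative:

    import time
    import requests

    def get_with_retries(url, max_attempts=5, timeout=10):
        """Retry transient failures, waiting 2, 4, 8, ... seconds between tries."""
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()  # Count HTTP errors as failed attempts
                return response
            except requests.exceptions.RequestException as exc:
                print(f"Attempt {attempt} failed: {exc}")
                if attempt == max_attempts:
                    raise  # Give up after the final attempt
                time.sleep(2 ** attempt)  # Exponential backoff: 2, 4, 8, ...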
---
 code/step_1_data_collection_Luis.py | 70 ++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 27 deletions(-)

diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py
index a18473b..5945465 100644
--- a/code/step_1_data_collection_Luis.py
+++ b/code/step_1_data_collection_Luis.py
@@ -3,6 +3,8 @@
 from io import StringIO
 from Bio import Medline
 from io import BytesIO
+import time
+import os
 
 class PubMedDownloader:
     def __init__(self, api_key, email):
@@ -11,36 +13,47 @@ def __init__(self, api_key, email):
         self.email = email
         Entrez.email = email  # Setting email for Biopython Entrez
 
-    def fetch_pubmed_data(self, query, batch_size=10000):
-        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
-        search_response = requests.get(search_url)
-        if search_response.status_code == 200:
-            try:
-                # Use BytesIO for binary data
-                search_results = Entrez.read(BytesIO(search_response.content))
-                webenv = search_results['WebEnv']
-                query_key = search_results['QueryKey']
-                count = int(search_results['Count'])
-                print(f"Total records found: {count}")
-            except Exception as e:
-                print("Error reading search results:", e)
-                return []
-        else:
-            print("Failed to retrieve search results")
-            return []
-
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
         records = []
-        for start in range(0, count, batch_size):
-            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
-            fetch_response = requests.get(fetch_url)
-            if fetch_response.status_code == 200:
-                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
-                print(f"Fetched {start + batch_size} of {count} records")
-            else:
-                print(f"Failed to fetch data for batch starting at {start}")
+        attempt = 0
+        max_attempts = 5
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                        fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                        fetch_response = requests.get(fetch_url, timeout=10)
+                        records.append(fetch_response.text)
+                        print(f"Fetched records starting from {start}")
+                    break
+                else:
+                    print(f"Failed to initiate search with status {search_response.status_code}")
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
 
+        # Save records to a file
+        self.save_records_to_file(query, year, records)
         return records
 
+    def save_records_to_file(self, query, year, records):
+        directory = "./results/baseline_doc"
+        os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))  # Each record is separated by a newline
+        print(f"Saved records to {file_path}")
+
 class ids_pubmed():
     def __init__(self):
         self.snp_ids = []
@@ -104,4 +117,7 @@ def search_ids(self, search_email):
 downloader = PubMedDownloader(api_key, email)
 
 topic = "zinc"  # Define the topic of interest
-pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
\ No newline at end of file
+# Fetch and save records by year
+for year in range(1990, 2023):  # Example range of years
+    year_query = f"{topic} AND {year}[Date]"
+    downloader.fetch_pubmed_data(year_query, year)
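A note on reading the output back: because the patch switches efetch to rettype=medline, each saved file is plain MEDLINE text, which the Bio.Medline parser the script already imports can read directly. A minimal sketch, assuming the 1990 "zinc" file has already been fetched; the path follows the f"{query}.{year}.txt" naming used by save_records_to_file:

    from Bio import Medline

    # Hypothetical example path, as produced by save_records_to_file for
    # the query "zinc AND 1990[Date]" and year 1990.
    path = "./results/baseline_doc/zinc AND 1990[Date].1990.txt"

    with open(path, encoding="utf-8") as handle:
        for record in Medline.parse(handle):
            # MEDLINE records expose fields such as PMID, TI (title), AB (abstract)
            print(record.get("PMID"), record.get("TI"))

Saving raw MEDLINE text keeps the download step decoupled from parsing, so records can be re-parsed later without re-querying the API.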