Skip to content

Commit

Permalink
Merge pull request #2 from lrm22005/Luis
Browse files Browse the repository at this point in the history
Luis
  • Loading branch information
lrm22005 authored May 6, 2024
2 parents e107ed9 + fe91786 commit 1655990
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 27 deletions.
32 changes: 32 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,35 @@ data/gene_name_info/query_snps.txt
data/gene_name_info/query_symbol.txt
results/baseline_doc/pubmed.zinc.0.15.txt
results/baseline_doc/pubmed.zinc.1.15.txt
code/step_1_data_collection_Luis.py
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 1990\[Date\].1990.txt
results/baseline_doc/zinc AND 1991\[Date\].1991.txt
results/baseline_doc/zinc AND 1992\[Date\].1992.txt
results/baseline_doc/zinc AND 1994\[Date\].1994.txt
results/baseline_doc/zinc AND 1993\[Date\].1993.txt
results/baseline_doc/zinc AND 1995\[Date\].1995.txt
results/baseline_doc/zinc AND 1996\[Date\].1996.txt
results/baseline_doc/zinc AND 1997\[Date\].1997.txt
results/baseline_doc/zinc AND 1998\[Date\].1998.txt
results/baseline_doc/zinc AND 1999\[Date\].1999.txt
results/baseline_doc/zinc AND 2000\[Date\].2000.txt
results/baseline_doc/zinc AND 2001\[Date\].2001.txt
results/baseline_doc/zinc AND 2002\[Date\].2002.txt
results/baseline_doc/zinc AND 2003\[Date\].2003.txt
results/baseline_doc/zinc AND 2004\[Date\].2004.txt
results/baseline_doc/zinc AND 2005\[Date\].2005.txt
results/baseline_doc/zinc AND 2006\[Date\].2006.txt
results/baseline_doc/zinc AND 2007\[Date\].2007.txt
results/baseline_doc/zinc AND 2009\[Date\].2009.txt
results/baseline_doc/zinc AND 2008\[Date\].2008.txt
results/baseline_doc/zinc AND 2010\[Date\].2010.txt
results/baseline_doc/zinc AND 2011\[Date\].2011.txt
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2012\[Date\].2012.txt
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
94 changes: 67 additions & 27 deletions code/step_1_data_collection_Luis.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,34 @@
"""
Code created by: lrmercadod
Date: 5/6/2024 10:43:45
PubMed Record Fetcher and Saver
This script is designed to automate the retrieval of PubMed records based on a specific topic and year. It uses the NCBI E-utilities API to fetch data in MEDLINE format and saves each year's data in a separate text file within a structured directory.
Features:
- Fetches PubMed records using a combination of the topic and year to form a query.
- Retrieves data in MEDLINE format, which includes structured bibliographic information.
- Saves the fetched data into text files, organizing them by topic and year under the './results/baseline_doc' directory.
- Handles network and API request errors by implementing retry logic with exponential backoff.
Usage:
- The user must provide an NCBI API key and email for using NCBI's E-utilities.
- Modify the 'topic' variable and the year range in the script to fetch records for different topics or years.
Dependencies:
- BioPython for interacting with NCBI's E-utilities.
- requests for making HTTP requests.
Example:
To use the script, simply run it in a Python environment with the necessary dependencies installed. Ensure that the API key and email are correctly set up in the script.
"""
import requests
from Bio import Entrez
from io import StringIO
from Bio import Medline
from io import BytesIO
import time
import os

class PubMedDownloader:
def __init__(self, api_key, email):
Expand All @@ -11,36 +37,47 @@ def __init__(self, api_key, email):
self.email = email
Entrez.email = email # Setting email for Biopython Entrez

def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
    """Fetch all PubMed records matching *query* and save them under *year*.

    Runs an ESearch with ``usehistory=y`` to obtain a WebEnv/QueryKey pair,
    then pages through EFetch in MEDLINE text format until all ``count``
    records are retrieved. Transient failures are retried with exponential
    backoff, up to five attempts.

    Parameters:
        query (str): Full Entrez query string, e.g. ``"zinc AND 1995[Date]"``.
        year (int | str): Year label used only to name the output file.
        max_records_per_query (int): Page size per EFetch call. NCBI caps
            ``retmax`` at 9999, hence the default.

    Returns:
        list[str]: Raw MEDLINE text chunks, one per fetched page. Empty if
        every attempt failed; the (possibly empty) result is still written
        to disk via ``save_records_to_file``.
    """
    records = []
    attempt = 0
    max_attempts = 5

    while attempt < max_attempts:
        try:
            search_url = (
                f"{self.base_url}esearch.fcgi?db=pubmed&term={query}"
                f"&retmax=1&api_key={self.api_key}&usehistory=y"
            )
            search_response = requests.get(search_url, timeout=10)
            if search_response.status_code == 200:
                # Entrez.read needs a binary stream, hence BytesIO.
                search_results = Entrez.read(BytesIO(search_response.content))
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
                count = int(search_results['Count'])
                print(f"Total records found for the query '{query}': {count}")

                # Page through the FULL result set. The previous version
                # iterated range(0, min(count, max_records_per_query),
                # max_records_per_query), which fetched at most one page.
                for start in range(0, count, max_records_per_query):
                    fetch_url = (
                        f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline"
                        f"&retmode=text&retstart={start}"
                        f"&retmax={max_records_per_query}&webenv={webenv}"
                        f"&query_key={query_key}&api_key={self.api_key}"
                    )
                    fetch_response = requests.get(fetch_url, timeout=10)
                    # Previously .text was consumed without checking status.
                    if fetch_response.status_code == 200:
                        records.append(fetch_response.text)
                        print(f"Fetched records starting from {start}")
                    else:
                        print(f"Failed to fetch batch starting at {start} "
                              f"with status {fetch_response.status_code}")
                break
            else:
                # Count a bad HTTP status as a failed attempt. Previously
                # `attempt` was never incremented on this path, so a
                # persistent non-200 response caused an infinite loop.
                attempt += 1
                print(f"Attempt {attempt}: search failed with status "
                      f"{search_response.status_code}")
                time.sleep(2 ** attempt)  # Exponential backoff
        except requests.exceptions.RequestException as e:
            attempt += 1
            print(f"Attempt {attempt}: An error occurred: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff

    # Persist whatever was fetched (possibly nothing) to disk.
    self.save_records_to_file(query, year, records)
    return records

def save_records_to_file(self, query, year, records):
    """Write fetched record chunks to ``./results/baseline_doc/<query>.<year>.txt``.

    Each element of *records* is joined with a single newline; the output
    directory is created on demand if it does not already exist.
    """
    out_dir = "./results/baseline_doc"
    # Idempotent: exist_ok avoids a race/error when the directory is present.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{query}.{year}.txt")
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(records))
    print(f"Saved records to {out_path}")

class ids_pubmed():
def __init__(self):
self.snp_ids = []
Expand Down Expand Up @@ -104,4 +141,7 @@ def search_ids(self, search_email):

# --- Script entry: download PubMed baseline documents year by year ---
# NOTE(review): `api_key` and `email` are defined earlier in the file
# (outside this span) — confirm they are set before this point runs.
downloader = PubMedDownloader(api_key, email)
topic = "zinc"  # Define the topic of interest

# Fetch and save records one year at a time; fetch_pubmed_data writes one
# file per (query, year) pair under ./results/baseline_doc.
# The stale call `fetch_pubmed_data(topic, 10000)` from the old API was
# removed: under the new signature it would have passed 10000 as `year`.
for year in range(1990, 2023):
    year_query = f"{topic} AND {year}[Date]"
    downloader.fetch_pubmed_data(year_query, year)
Empty file removed pubmed_data.txt
Empty file.
Empty file.
Empty file.

0 comments on commit 1655990

Please sign in to comment.