diff --git a/.gitignore b/.gitignore index e0116c1..9f9b25a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,35 @@ data/gene_name_info/query_snps.txt data/gene_name_info/query_symbol.txt results/baseline_doc/pubmed.zinc.0.15.txt results/baseline_doc/pubmed.zinc.1.15.txt +code/step_1_data_collection_Luis.py +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 1990\[Date\].1990.txt +results/baseline_doc/zinc AND 1991\[Date\].1991.txt +results/baseline_doc/zinc AND 1992\[Date\].1992.txt +results/baseline_doc/zinc AND 1994\[Date\].1994.txt +results/baseline_doc/zinc AND 1993\[Date\].1993.txt +results/baseline_doc/zinc AND 1995\[Date\].1995.txt +results/baseline_doc/zinc AND 1996\[Date\].1996.txt +results/baseline_doc/zinc AND 1997\[Date\].1997.txt +results/baseline_doc/zinc AND 1998\[Date\].1998.txt +results/baseline_doc/zinc AND 1999\[Date\].1999.txt +results/baseline_doc/zinc AND 2000\[Date\].2000.txt +results/baseline_doc/zinc AND 2001\[Date\].2001.txt +results/baseline_doc/zinc AND 2002\[Date\].2002.txt +results/baseline_doc/zinc AND 2003\[Date\].2003.txt +results/baseline_doc/zinc AND 2004\[Date\].2004.txt +results/baseline_doc/zinc AND 2005\[Date\].2005.txt +results/baseline_doc/zinc AND 2006\[Date\].2006.txt +results/baseline_doc/zinc AND 2007\[Date\].2007.txt +results/baseline_doc/zinc AND 2009\[Date\].2009.txt +results/baseline_doc/zinc AND 2008\[Date\].2008.txt +results/baseline_doc/zinc AND 2010\[Date\].2010.txt +results/baseline_doc/zinc AND 2011\[Date\].2011.txt +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 2012\[Date\].2012.txt +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 2013\[Date\].2013.txt +/results diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py index a18473b..eb607da 100644 --- 
a/code/step_1_data_collection_Luis.py +++ b/code/step_1_data_collection_Luis.py @@ -1,8 +1,34 @@ +""" +Code created by: lrmercadod +Date: 5/6/2024 10:43:45 +PubMed Record Fetcher and Saver + +This script is designed to automate the retrieval of PubMed records based on a specific topic and year. It uses the NCBI E-utilities API to fetch data in MEDLINE format and saves each year's data in a separate text file within a structured directory. + +Features: +- Fetches PubMed records using a combination of the topic and year to form a query. +- Retrieves data in MEDLINE format, which includes structured bibliographic information. +- Saves the fetched data into text files, organizing them by topic and year under the './results/baseline_doc' directory. +- Handles network and API request errors by implementing retry logic with exponential backoff. + +Usage: +- The user must provide an NCBI API key and email for using NCBI's E-utilities. +- Modify the 'topic' variable and the year range in the script to fetch records for different topics or years. + +Dependencies: +- BioPython for interacting with NCBI's E-utilities. +- requests for making HTTP requests. + +Example: +To use the script, simply run it in a Python environment with the necessary dependencies installed. Ensure that the API key and email are correctly set up in the script. 
+""" import requests from Bio import Entrez from io import StringIO from Bio import Medline from io import BytesIO +import time +import os class PubMedDownloader: def __init__(self, api_key, email): @@ -11,36 +37,47 @@ def __init__(self, api_key, email): self.email = email Entrez.email = email # Setting email for Biopython Entrez - def fetch_pubmed_data(self, query, batch_size=10000): - search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y" - search_response = requests.get(search_url) - if search_response.status_code == 200: - try: - # Use BytesIO for binary data - search_results = Entrez.read(BytesIO(search_response.content)) - webenv = search_results['WebEnv'] - query_key = search_results['QueryKey'] - count = int(search_results['Count']) - print(f"Total records found: {count}") - except Exception as e: - print("Error reading search results:", e) - return [] - else: - print("Failed to retrieve search results") - return [] - + def fetch_pubmed_data(self, query, year, max_records_per_query=9999): records = [] - for start in range(0, count, batch_size): - fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" - fetch_response = requests.get(fetch_url) - if fetch_response.status_code == 200: - records.extend(fetch_response.content.decode('utf-8').split('\n\n')) # Each record separated by two newlines - print(f"Fetched {start + batch_size} of {count} records") - else: - print(f"Failed to fetch data for batch starting at {start}") + attempt = 0 + max_attempts = 5 + while attempt < max_attempts: + try: + search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y" + search_response = requests.get(search_url, timeout=10) + if search_response.status_code == 200: + search_results = Entrez.read(BytesIO(search_response.content)) + webenv = 
search_results['WebEnv'] + query_key = search_results['QueryKey'] + count = int(search_results['Count']) + print(f"Total records found for the query '{query}': {count}") + + for start in range(0, min(count, max_records_per_query), max_records_per_query): + fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" + fetch_response = requests.get(fetch_url, timeout=10) + records.append(fetch_response.text) + print(f"Fetched records starting from {start}") + break + else: + print(f"Failed to initiate search with status {search_response.status_code}") + except requests.exceptions.RequestException as e: + attempt += 1 + print(f"Attempt {attempt}: An error occurred: {e}") + time.sleep(2 ** attempt) # Exponential backoff + # Save records to a file + self.save_records_to_file(query, year, records) + return records + def save_records_to_file(self, query, year, records): + directory = f"./results/baseline_doc" + os.makedirs(directory, exist_ok=True) # Create directory if it doesn't exist + filename = f"{query}.{year}.txt" + file_path = os.path.join(directory, filename) + with open(file_path, 'w', encoding='utf-8') as file: + file.write("\n".join(records)) # each element is one fetched batch of MEDLINE text; batches are newline-separated + print(f"Saved records to {file_path}") + class ids_pubmed(): + def __init__(self): + self.snp_ids = [] @@ -104,4 +141,7 @@ def search_ids(self, search_email): downloader = PubMedDownloader(api_key, email) topic = "zinc" # Define the topic of interest -pubmed_records = downloader.fetch_pubmed_data(topic, 10000) # Adjust batch size as needed \ No newline at end of file +# Fetch and save records by year +for year in range(1990, 2023): # example range: 1990-2022 inclusive (range() end is exclusive) + year_query = f"{topic} AND {year}[Date]" + downloader.fetch_pubmed_data(year_query, year) \ No newline at end of file diff --git a/pubmed_data.txt b/pubmed_data.txt deleted file mode 100644 index
e69de29..0000000 diff --git a/results/baseline_doc/pubmed.zinc.0.full.txt b/results/baseline_doc/pubmed.zinc.0.full.txt deleted file mode 100644 index e69de29..0000000 diff --git a/results/baseline_doc/pubmed.zinc.2.15.txt b/results/baseline_doc/pubmed.zinc.2.15.txt deleted file mode 100644 index e69de29..0000000