From afd4e221257145f72ec8ada0e931f19202f2542b Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Mon, 6 May 2024 12:12:50 -0400 Subject: [PATCH 1/4] Found a method to fetch This is my implementation that downloads and creates the files with all the information for a specific year. This method is a working approach. --- .gitignore | 32 +++++++++++++++++++++ results/baseline_doc/pubmed.zinc.0.full.txt | 0 results/baseline_doc/pubmed.zinc.2.15.txt | 0 3 files changed, 32 insertions(+) delete mode 100644 results/baseline_doc/pubmed.zinc.0.full.txt delete mode 100644 results/baseline_doc/pubmed.zinc.2.15.txt diff --git a/.gitignore b/.gitignore index e0116c1..9f9b25a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,35 @@ data/gene_name_info/query_snps.txt data/gene_name_info/query_symbol.txt results/baseline_doc/pubmed.zinc.0.15.txt results/baseline_doc/pubmed.zinc.1.15.txt +code/step_1_data_collection_Luis.py +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 1990\[Date\].1990.txt +results/baseline_doc/zinc AND 1991\[Date\].1991.txt +results/baseline_doc/zinc AND 1992\[Date\].1992.txt +results/baseline_doc/zinc AND 1994\[Date\].1994.txt +results/baseline_doc/zinc AND 1993\[Date\].1993.txt +results/baseline_doc/zinc AND 1995\[Date\].1995.txt +results/baseline_doc/zinc AND 1996\[Date\].1996.txt +results/baseline_doc/zinc AND 1997\[Date\].1997.txt +results/baseline_doc/zinc AND 1998\[Date\].1998.txt +results/baseline_doc/zinc AND 1999\[Date\].1999.txt +results/baseline_doc/zinc AND 2000\[Date\].2000.txt +results/baseline_doc/zinc AND 2001\[Date\].2001.txt +results/baseline_doc/zinc AND 2002\[Date\].2002.txt +results/baseline_doc/zinc AND 2003\[Date\].2003.txt +results/baseline_doc/zinc AND 2004\[Date\].2004.txt +results/baseline_doc/zinc AND 2005\[Date\].2005.txt +results/baseline_doc/zinc AND 2006\[Date\].2006.txt +results/baseline_doc/zinc AND 2007\[Date\].2007.txt 
+results/baseline_doc/zinc AND 2009\[Date\].2009.txt +results/baseline_doc/zinc AND 2008\[Date\].2008.txt +results/baseline_doc/zinc AND 2010\[Date\].2010.txt +results/baseline_doc/zinc AND 2011\[Date\].2011.txt +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 2012\[Date\].2012.txt +results/baseline_doc/pubmed.zinc.0.full.txt +results/baseline_doc/pubmed.zinc.2.15.txt +results/baseline_doc/zinc AND 2013\[Date\].2013.txt +/results diff --git a/results/baseline_doc/pubmed.zinc.0.full.txt b/results/baseline_doc/pubmed.zinc.0.full.txt deleted file mode 100644 index e69de29..0000000 diff --git a/results/baseline_doc/pubmed.zinc.2.15.txt b/results/baseline_doc/pubmed.zinc.2.15.txt deleted file mode 100644 index e69de29..0000000 From 510e5e21cc3ffedc3633af3f22e9bcd94604d04a Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Mon, 6 May 2024 12:27:48 -0400 Subject: [PATCH 2/4] step_1_data_collection_Luis.py Enhance PubMed Data Fetching and Saving Mechanism This commit introduces several enhancements to the PubMedDownloader class, improving its functionality and usability: 1. **Dynamic Year Querying**: Added support for dynamic querying by year. This allows users to specify a range of years for which the PubMed records should be fetched. 2. **Structured Data Saving**: Implemented functionality to save the fetched PubMed records in MEDLINE format. Each year's data is saved in a separate text file, named according to the query and the year, facilitating easier data management and retrieval. 3. **Error Handling**: Enhanced error handling capabilities to manage network issues and API limitations more robustly. This includes retry mechanisms with exponential backoff and timeout settings to prevent hanging requests. 4. **Directory Management**: Automated directory creation for storing the output files, ensuring that the user does not need to manually create directories before running the script. 
These enhancements make the script more robust and user-friendly, suitable for handling large-scale data retrieval tasks in biomedical research environments. --- code/step_1_data_collection_Luis.py | 70 ++++++++++++++++++----------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py index a18473b..5945465 100644 --- a/code/step_1_data_collection_Luis.py +++ b/code/step_1_data_collection_Luis.py @@ -3,6 +3,8 @@ from io import StringIO from Bio import Medline from io import BytesIO +import time +import os class PubMedDownloader: def __init__(self, api_key, email): @@ -11,36 +13,47 @@ def __init__(self, api_key, email): self.email = email Entrez.email = email # Setting email for Biopython Entrez - def fetch_pubmed_data(self, query, batch_size=10000): - search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y" - search_response = requests.get(search_url) - if search_response.status_code == 200: - try: - # Use BytesIO for binary data - search_results = Entrez.read(BytesIO(search_response.content)) - webenv = search_results['WebEnv'] - query_key = search_results['QueryKey'] - count = int(search_results['Count']) - print(f"Total records found: {count}") - except Exception as e: - print("Error reading search results:", e) - return [] - else: - print("Failed to retrieve search results") - return [] - + def fetch_pubmed_data(self, query, year, max_records_per_query=9999): records = [] - for start in range(0, count, batch_size): - fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" - fetch_response = requests.get(fetch_url) - if fetch_response.status_code == 200: - records.extend(fetch_response.content.decode('utf-8').split('\n\n')) # Each record separated by two newlines - print(f"Fetched {start + batch_size} of 
{count} records") - else: - print(f"Failed to fetch data for batch starting at {start}") + attempt = 0 + max_attempts = 5 + while attempt < max_attempts: + try: + search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y" + search_response = requests.get(search_url, timeout=10) + if search_response.status_code == 200: + search_results = Entrez.read(BytesIO(search_response.content)) + webenv = search_results['WebEnv'] + query_key = search_results['QueryKey'] + count = int(search_results['Count']) + print(f"Total records found for the query '{query}': {count}") + + for start in range(0, min(count, max_records_per_query), max_records_per_query): + fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" + fetch_response = requests.get(fetch_url, timeout=10) + records.append(fetch_response.text) + print(f"Fetched records starting from {start}") + break + else: + print(f"Failed to initiate search with status {search_response.status_code}") + except requests.exceptions.RequestException as e: + attempt += 1 + print(f"Attempt {attempt}: An error occurred: {e}") + time.sleep(2 ** attempt) # Exponential backoff + # Save records to a file + self.save_records_to_file(query, year, records) return records + def save_records_to_file(self, query, year, records): + directory = f"./results/baseline_doc" + os.makedirs(directory, exist_ok=True) # Create directory if it doesn't exist + filename = f"{query}.{year}.txt" + file_path = os.path.join(directory, filename) + with open(file_path, 'w', encoding='utf-8') as file: + file.write("\n".join(records)) # Each record is separated by a newline + print(f"Saved records to {file_path}") + class ids_pubmed(): def __init__(self): self.snp_ids = [] @@ -104,4 +117,7 @@ def search_ids(self, search_email): downloader = PubMedDownloader(api_key, email) topic = 
"zinc" # Define the topic of interest -pubmed_records = downloader.fetch_pubmed_data(topic, 10000) # Adjust batch size as needed \ No newline at end of file +# Fetch and save records by year +for year in range(1990, 2023): # Example range of years + year_query = f"{topic} AND {year}[Date]" + downloader.fetch_pubmed_data(year_query, year) \ No newline at end of file From 77d0ba9d54f7a55b3d13dfb4dfda0290a9622c78 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Mon, 6 May 2024 12:27:58 -0400 Subject: [PATCH 3/4] Delete pubmed_data.txt --- pubmed_data.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 pubmed_data.txt diff --git a/pubmed_data.txt b/pubmed_data.txt deleted file mode 100644 index e69de29..0000000 From fe9178672376327c88dd075da21ce030b3a598a2 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Mon, 6 May 2024 12:31:19 -0400 Subject: [PATCH 4/4] Add functionality to fetch and save PubMed records by year This commit introduces significant enhancements to the PubMedDownloader class, enabling the automated fetching of PubMed records by topic and year using the NCBI E-utilities API. Each set of records fetched for a specific year is now saved in a dedicated text file formatted in MEDLINE style, facilitating easier access and organization of the data. Key Changes: - Added file saving functionality that organizes records into files named by topic and year. - Implemented error handling with retry logic and exponential backoff to manage network and API errors more robustly. - Configured the fetch function to retrieve records in MEDLINE format, ensuring that the data is structured according to PubMed's bibliographic standards. The records are stored in the './results/baseline_doc' directory, with each file representing a specific year's data on the chosen topic. This update is crucial for researchers needing structured and easily accessible bibliographic information from PubMed. 
--- code/step_1_data_collection_Luis.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py index 5945465..eb607da 100644 --- a/code/step_1_data_collection_Luis.py +++ b/code/step_1_data_collection_Luis.py @@ -1,3 +1,27 @@ +""" +Code created by: lrmercadod +Date: 5/6/2024 10:43:45 +PubMed Record Fetcher and Saver + +This script is designed to automate the retrieval of PubMed records based on a specific topic and year. It uses the NCBI E-utilities API to fetch data in MEDLINE format and saves each year's data in a separate text file within a structured directory. + +Features: +- Fetches PubMed records using a combination of the topic and year to form a query. +- Retrieves data in MEDLINE format, which includes structured bibliographic information. +- Saves the fetched data into text files, organizing them by topic and year under the './results/baseline_doc' directory. +- Handles network and API request errors by implementing retry logic with exponential backoff. + +Usage: +- The user must provide an NCBI API key and email for using NCBI's E-utilities. +- Modify the 'topic' variable and the year range in the script to fetch records for different topics or years. + +Dependencies: +- BioPython for interacting with NCBI's E-utilities. +- requests for making HTTP requests. + +Example: +To use the script, simply run it in a Python environment with the necessary dependencies installed. Ensure that the API key and email are correctly set up in the script. +""" import requests from Bio import Entrez from io import StringIO