Skip to content

Commit

Permalink
Merge pull request #2 from lrm22005/Luis
Browse files Browse the repository at this point in the history
Luis
  • Loading branch information
lrm22005 authored May 6, 2024
2 parents e107ed9 + fe91786 commit 1655990
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 27 deletions.
32 changes: 32 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,35 @@ data/gene_name_info/query_snps.txt
data/gene_name_info/query_symbol.txt
results/baseline_doc/pubmed.zinc.0.15.txt
results/baseline_doc/pubmed.zinc.1.15.txt
code/step_1_data_collection_Luis.py
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 1990\[Date\].1990.txt
results/baseline_doc/zinc AND 1991\[Date\].1991.txt
results/baseline_doc/zinc AND 1992\[Date\].1992.txt
results/baseline_doc/zinc AND 1994\[Date\].1994.txt
results/baseline_doc/zinc AND 1993\[Date\].1993.txt
results/baseline_doc/zinc AND 1995\[Date\].1995.txt
results/baseline_doc/zinc AND 1996\[Date\].1996.txt
results/baseline_doc/zinc AND 1997\[Date\].1997.txt
results/baseline_doc/zinc AND 1998\[Date\].1998.txt
results/baseline_doc/zinc AND 1999\[Date\].1999.txt
results/baseline_doc/zinc AND 2000\[Date\].2000.txt
results/baseline_doc/zinc AND 2001\[Date\].2001.txt
results/baseline_doc/zinc AND 2002\[Date\].2002.txt
results/baseline_doc/zinc AND 2003\[Date\].2003.txt
results/baseline_doc/zinc AND 2004\[Date\].2004.txt
results/baseline_doc/zinc AND 2005\[Date\].2005.txt
results/baseline_doc/zinc AND 2006\[Date\].2006.txt
results/baseline_doc/zinc AND 2007\[Date\].2007.txt
results/baseline_doc/zinc AND 2009\[Date\].2009.txt
results/baseline_doc/zinc AND 2008\[Date\].2008.txt
results/baseline_doc/zinc AND 2010\[Date\].2010.txt
results/baseline_doc/zinc AND 2011\[Date\].2011.txt
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2012\[Date\].2012.txt
results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
94 changes: 67 additions & 27 deletions code/step_1_data_collection_Luis.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,34 @@
"""
Code created by: lrmercadod
Date: 5/6/2024 10:43:45
PubMed Record Fetcher and Saver
This script is designed to automate the retrieval of PubMed records based on a specific topic and year. It uses the NCBI E-utilities API to fetch data in MEDLINE format and saves each year's data in a separate text file within a structured directory.
Features:
- Fetches PubMed records using a combination of the topic and year to form a query.
- Retrieves data in MEDLINE format, which includes structured bibliographic information.
- Saves the fetched data into text files, organizing them by topic and year under the './results/baseline_doc' directory.
- Handles network and API request errors by implementing retry logic with exponential backoff.
Usage:
- The user must provide an NCBI API key and email for using NCBI's E-utilities.
- Modify the 'topic' variable and the year range in the script to fetch records for different topics or years.
Dependencies:
- BioPython for interacting with NCBI's E-utilities.
- requests for making HTTP requests.
Example:
To use the script, simply run it in a Python environment with the necessary dependencies installed. Ensure that the API key and email are correctly set up in the script.
"""
import requests
from Bio import Entrez
from io import StringIO
from Bio import Medline
from io import BytesIO
import time
import os

class PubMedDownloader:
def __init__(self, api_key, email):
Expand All @@ -11,36 +37,47 @@ def __init__(self, api_key, email):
self.email = email
Entrez.email = email # Setting email for Biopython Entrez

def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
    """Fetch all PubMed records matching *query* and save them under *year*.

    Runs an ESearch with ``usehistory=y`` to obtain a WebEnv/QueryKey pair,
    then pages through EFetch in MEDLINE text format until all ``count``
    records are retrieved. Transient failures are retried with exponential
    backoff, up to five attempts.

    Parameters:
        query (str): Full Entrez query string, e.g. ``"zinc AND 1995[Date]"``.
        year (int | str): Year label used only to name the output file.
        max_records_per_query (int): Page size per EFetch call. NCBI caps
            ``retmax`` at 9999, hence the default.

    Returns:
        list[str]: Raw MEDLINE text chunks, one per fetched page. Empty if
        every attempt failed; the (possibly empty) result is still written
        to disk via ``save_records_to_file``.
    """
    records = []
    attempt = 0
    max_attempts = 5

    while attempt < max_attempts:
        try:
            search_url = (
                f"{self.base_url}esearch.fcgi?db=pubmed&term={query}"
                f"&retmax=1&api_key={self.api_key}&usehistory=y"
            )
            search_response = requests.get(search_url, timeout=10)
            if search_response.status_code == 200:
                # Entrez.read needs a binary stream, hence BytesIO.
                search_results = Entrez.read(BytesIO(search_response.content))
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
                count = int(search_results['Count'])
                print(f"Total records found for the query '{query}': {count}")

                # Page through the FULL result set. The previous version
                # iterated range(0, min(count, max_records_per_query),
                # max_records_per_query), which fetched at most one page.
                for start in range(0, count, max_records_per_query):
                    fetch_url = (
                        f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline"
                        f"&retmode=text&retstart={start}"
                        f"&retmax={max_records_per_query}&webenv={webenv}"
                        f"&query_key={query_key}&api_key={self.api_key}"
                    )
                    fetch_response = requests.get(fetch_url, timeout=10)
                    # Previously .text was consumed without checking status.
                    if fetch_response.status_code == 200:
                        records.append(fetch_response.text)
                        print(f"Fetched records starting from {start}")
                    else:
                        print(f"Failed to fetch batch starting at {start} "
                              f"with status {fetch_response.status_code}")
                break
            else:
                # Count a bad HTTP status as a failed attempt. Previously
                # `attempt` was never incremented on this path, so a
                # persistent non-200 response caused an infinite loop.
                attempt += 1
                print(f"Attempt {attempt}: search failed with status "
                      f"{search_response.status_code}")
                time.sleep(2 ** attempt)  # Exponential backoff
        except requests.exceptions.RequestException as e:
            attempt += 1
            print(f"Attempt {attempt}: An error occurred: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff

    # Persist whatever was fetched (possibly nothing) to disk.
    self.save_records_to_file(query, year, records)
    return records

def save_records_to_file(self, query, year, records):
    """Write fetched record chunks to ``./results/baseline_doc/<query>.<year>.txt``.

    Each element of *records* is joined with a single newline; the output
    directory is created on demand if it does not already exist.
    """
    out_dir = "./results/baseline_doc"
    # Idempotent: exist_ok avoids a race/error when the directory is present.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{query}.{year}.txt")
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(records))
    print(f"Saved records to {out_path}")

class ids_pubmed():
def __init__(self):
self.snp_ids = []
Expand Down Expand Up @@ -104,4 +141,7 @@ def search_ids(self, search_email):

# --- Script entry: download PubMed baseline documents year by year ---
# NOTE(review): `api_key` and `email` are defined earlier in the file
# (outside this span) — confirm they are set before this point runs.
downloader = PubMedDownloader(api_key, email)
topic = "zinc"  # Define the topic of interest

# Fetch and save records one year at a time; fetch_pubmed_data writes one
# file per (query, year) pair under ./results/baseline_doc.
# The stale call `fetch_pubmed_data(topic, 10000)` from the old API was
# removed: under the new signature it would have passed 10000 as `year`.
for year in range(1990, 2023):
    year_query = f"{topic} AND {year}[Date]"
    downloader.fetch_pubmed_data(year_query, year)
Empty file removed pubmed_data.txt
Empty file.
Empty file.
Empty file.

0 comments on commit 1655990

Please sign in to comment.