diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py
index a18473b..5945465 100644
--- a/code/step_1_data_collection_Luis.py
+++ b/code/step_1_data_collection_Luis.py
@@ -3,6 +3,8 @@
 from io import StringIO
 from Bio import Medline
 from io import BytesIO
+import time
+import os
 
 class PubMedDownloader:
     def __init__(self, api_key, email):
@@ -11,36 +13,47 @@ def __init__(self, api_key, email):
         self.email = email
         Entrez.email = email  # Setting email for Biopython Entrez
 
-    def fetch_pubmed_data(self, query, batch_size=10000):
-        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
-        search_response = requests.get(search_url)
-        if search_response.status_code == 200:
-            try:
-                # Use BytesIO for binary data
-                search_results = Entrez.read(BytesIO(search_response.content))
-                webenv = search_results['WebEnv']
-                query_key = search_results['QueryKey']
-                count = int(search_results['Count'])
-                print(f"Total records found: {count}")
-            except Exception as e:
-                print("Error reading search results:", e)
-                return []
-        else:
-            print("Failed to retrieve search results")
-            return []
-
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
         records = []
-        for start in range(0, count, batch_size):
-            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
-            fetch_response = requests.get(fetch_url)
-            if fetch_response.status_code == 200:
-                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
-                print(f"Fetched {start + batch_size} of {count} records")
-            else:
-                print(f"Failed to fetch data for batch starting at {start}")
+        attempt = 0
+        max_attempts = 5
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                        fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                        fetch_response = requests.get(fetch_url, timeout=10)
+                        records.append(fetch_response.text)
+                        print(f"Fetched records starting from {start}")
+                    break
+                else:
+                    print(f"Failed to initiate search with status {search_response.status_code}")
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+        # Save records to a file
+        self.save_records_to_file(query, year, records)
         return records
 
+    def save_records_to_file(self, query, year, records):
+        directory = f"./results/baseline_doc"
+        os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))  # Each record is separated by a newline
+        print(f"Saved records to {file_path}")
+
 class ids_pubmed():
     def __init__(self):
         self.snp_ids = []
@@ -104,4 +117,7 @@ def search_ids(self, search_email):
 downloader = PubMedDownloader(api_key, email)
 topic = "zinc"  # Define the topic of interest
-pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
\ No newline at end of file
+# Fetch and save records by year
+for year in range(1990, 2023):  # Example range of years
+    year_query = f"{topic} AND {year}[Date]"
+    downloader.fetch_pubmed_data(year_query, year)
\ No newline at end of file