diff --git a/.gitignore b/.gitignore
index 9f9b25a..8dc7703 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,4 @@ results/baseline_doc/pubmed.zinc.0.full.txt
 results/baseline_doc/pubmed.zinc.2.15.txt
 results/baseline_doc/zinc AND 2013\[Date\].2013.txt
 /results
+config.ini
diff --git a/code/step1_data_collection.py b/code/step1_data_collection.py
index e01710c..863cfc6 100644
--- a/code/step1_data_collection.py
+++ b/code/step1_data_collection.py
@@ -52,7 +52,7 @@
 ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up
 
 g2d_starting_point = 0
-batch_size = 100
+batch_size = 1000
 #############################
 #####################
 gene_end_point = round(query_size/batch_size)
diff --git a/code/step1_data_collection_Custom_Luis.py b/code/step1_data_collection_Custom_Luis.py
index 6f86893..5f61509 100644
--- a/code/step1_data_collection_Custom_Luis.py
+++ b/code/step1_data_collection_Custom_Luis.py
@@ -1,76 +1,151 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Jun 21 00:16:25 2020
-python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
-"""
-
 import os
-import pathlib
-import sys
 import time
-import urllib.error
-
-sys.path.append('lib')
-from lib.Literature_Data_Collection import literature_data_collection
-
-if len(sys.argv) > 3:
-    word_query = str(sys.argv[1])
-    word_end_point = int(sys.argv[2]) # the endpoint of a word-based data collection. for demo-b 100000
-    gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection for demo-b 50
-    paths = str(sys.argv[4]) + '/'
-elif len(sys.argv) == 3:
-    word_query = str(sys.argv[1])
-    paths = str(sys.argv[2]) + '/'
-
-data_dir = os.path.abspath(os.getcwd())
-output_dir = os.path.join(data_dir, paths + 'baseline_doc')
-document_output_dir = os.path.join(data_dir, paths + 'gene2document')
-pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)
-
-email = "lrmercadod@gmail.com" # Replace with your valid email address
-api_key = "19bea34a4dbdbc6ef30392cee15943365309"
-ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
-
-# setting up
-########### word query based literature data collection #################
-gap = 1000
-batch = 200
-w2d_starting_point = 0
+import shutil
+import logging
+import requests
+from Bio import Entrez
+from io import BytesIO
+import configparser
+
+# Ensure the current working directory is correct
+print("Current working directory:", os.getcwd())
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class PubMedDownloader:
+    def __init__(self, api_key, email, max_records_per_query=9999):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
+        self.email = email
+        self.max_records_per_query = max_records_per_query
+        Entrez.email = email # Set email for NCBI E-utilities
+
+    def fetch_pubmed_data(self, query, year):
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        records = []
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    logging.info(f"Total records found for the query '{query}': {count}")
+
+                    if count > 0:
+                        for start in range(0, min(count, self.max_records_per_query), self.max_records_per_query):
+                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                            fetch_response = requests.get(fetch_url, timeout=10)
+                            records.append(fetch_response.text)
+                            logging.info(f"Fetched records starting from {start}")
+                        return self.save_records_to_file(normalized_query, year, records)
+                    else:
+                        logging.info(f"No records found for the query '{query}'")
+                        return []
+                break
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                logging.error(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)
+        return []
+
+    def save_records_to_file(self, query, year, records):
+        directory = os.path.join(".", "results", "baseline_doc")
+        os.makedirs(directory, exist_ok=True)
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))
+        logging.info(f"Saved records to {file_path}")
+        return file_path
+
+    def consolidate_files(self, query):
+        directory = os.path.join(".", "results", "baseline_doc")
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")
+
+        # Check if there are any files to consolidate
+        if not os.listdir(directory):
+            logging.info("No files found in the directory to consolidate.")
+            return
+
+        # Opening the consolidated file outside the loop to write all contents
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            # Loop over each file in the directory
+            for fname in os.listdir(directory):
+                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
+                    filepath = os.path.join(directory, fname)
+                    # Ensure the file is not the consolidated file itself
+                    if filepath != consolidated_file_path:
+                        # Open, read, and close the file
+                        with open(filepath, 'r', encoding='utf-8') as infile:
+                            content = infile.read()
+                            outfile.write(content + "\n")
+                            logging.info(f"Added content from {fname} to the consolidated file.")
+
+                        # Remove the individual file after its content has been written
+                        try:
+                            os.remove(filepath)
+                            logging.info(f"Removed file {fname} after consolidation.")
+                        except OSError as e:
+                            logging.error(f"Error occurred while removing file {fname}: {e}")
+
+        logging.info(f"Consolidated records into {consolidated_file_path}")
+
+        # Optional: Clean up the directory if empty
+        if not os.listdir(directory):
+            shutil.rmtree(directory)
+            logging.info("Removed empty directory after consolidation.")
+
+# # Read API key and email from the configuration file
+# config = configparser.ConfigParser()
+# config.read('config.ini')
+# api_key = config.get('pubmed', 'api_key')
+# email = config.get('pubmed', 'email')
+
+# Ensure the current working directory is correct
+print("Current working directory:", os.getcwd())
+
+config = configparser.ConfigParser()
+config_path = 'config.ini' # Make sure this path is correct
+
+# Check if the config file exists to rule out path issues
+if not os.path.exists(config_path):
+    print(f"Configuration file not found at {config_path}")
+else:
print(f"Configuration file found at {config_path}") try: - search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query) - print('The number of available abstracts:', _word_end_point, 'for', word_query) - - if int(sys.argv[2]) == 0: - word_end_point = _word_end_point - - ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, - ixs=w2d_starting_point, test_end_point=word_end_point) -except urllib.error.HTTPError as e: - print(f"An HTTP error occurred: {e}") - print("Retrying in 5 seconds...") - time.sleep(5) - # Retry the request or handle the error appropriately - -########### gene name-query based literature data collection ################# -query_full = ld.text_open('./data/gene_name_info/query_full_name.txt') -query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt') -# gene name list -query_size = len(query_full) -ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up - -g2d_starting_point = 0 -batch_size = 10 - -############################ -gene_end_point = round(query_size / batch_size) - -if len(sys.argv) > 2: - gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection - -if int(sys.argv[3]) == 0: - gene_end_point = round(query_size / batch_size) - -ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, - query_len=len(query_full), end_point=gene_end_point) \ No newline at end of file + config.read(config_path) + # Explicitly list sections and keys + print("Sections available:", config.sections()) + # Attempt to read the API key and email from the 'DEFAULT' section + api_key = config.get('DEFAULT', 'api_key') + email = config.get('DEFAULT', 'email') +except configparser.NoSectionError as e: + print(f"Missing section in your configuration file: {e}") +except configparser.NoOptionError as e: + print(f"Missing option in your configuration file: {e}") +except Exception as e: + print(f"An error occurred while reading the configuration file: {e}") + +# Create an instance of PubMedDownloader +downloader = PubMedDownloader(api_key, email) + +# Define the topic and year range +topic = "gene expression" +start_year = 1990 +end_year = 2024 + +# Fetch and save records by year, then consolidate and clean up +for year in range(start_year, end_year + 1): + year_query = f"{topic} AND {year}[Date]" + downloader.fetch_pubmed_data(year_query, year) + +# Consolidate all files into one +downloader.consolidate_files(topic) \ No newline at end of file diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py index eb607da..ca511f1 100644 --- a/code/step_1_data_collection_Luis.py +++ b/code/step_1_data_collection_Luis.py @@ -24,20 +24,20 @@ """ import requests from Bio import Entrez -from io import StringIO -from Bio import Medline from io import BytesIO import time import os +import shutil class PubMedDownloader: def __init__(self, api_key, email): self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" self.api_key = api_key self.email = email - Entrez.email = email # Setting email for Biopython Entrez + Entrez.email = email # Set email for NCBI E-utilities def fetch_pubmed_data(self, query, year, max_records_per_query=9999): + normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "") records = [] attempt = 0 max_attempts = 5 @@ -53,95 +53,75 @@ def fetch_pubmed_data(self, query, year, max_records_per_query=9999): count = int(search_results['Count']) 
print(f"Total records found for the query '{query}': {count}") - for start in range(0, min(count, max_records_per_query), max_records_per_query): - fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" - fetch_response = requests.get(fetch_url, timeout=10) - records.append(fetch_response.text) - print(f"Fetched records starting from {start}") - break - else: - print(f"Failed to initiate search with status {search_response.status_code}") + if count > 0: + for start in range(0, min(count, max_records_per_query), max_records_per_query): + fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" + fetch_response = requests.get(fetch_url, timeout=10) + records.append(fetch_response.text) + print(f"Fetched records starting from {start}") + return self.save_records_to_file(normalized_query, year, records) + break except requests.exceptions.RequestException as e: attempt += 1 print(f"Attempt {attempt}: An error occurred: {e}") - time.sleep(2 ** attempt) # Exponential backoff - # Save records to a file - self.save_records_to_file(query, year, records) - return records + time.sleep(2 ** attempt) + return None def save_records_to_file(self, query, year, records): - directory = f"./results/baseline_doc" - os.makedirs(directory, exist_ok=True) # Create directory if it doesn't exist + directory = "./results/baseline_doc/" + os.makedirs(directory, exist_ok=True) filename = f"{query}.{year}.txt" file_path = os.path.join(directory, filename) with open(file_path, 'w', encoding='utf-8') as file: - file.write("\n".join(records)) # Each record is separated by a newline + file.write("\n".join(records)) print(f"Saved records to {file_path}") + return file_path -class ids_pubmed(): - def __init__(self): - self.snp_ids = [] - self.uids = [] - self.gene_names = [] - self.names = [] - self.records = [] - self.gene_full_names = [] - self.saved_snp_id = [] - - def search_ids(self, search_email): - removal_index = [] - Entrez.email = search_email - records = [] - for snp_id in self.snp_ids: - record = Entrez.read(Entrez.elink(dbfrom="snp", - id=snp_id.replace('rs', ''), - db="gene")) - if record[0]['LinkSetDb'] == []: - removal_index.append(snp_id) - print("index is removed: ", snp_id) - - else: - results = record[0]['LinkSetDb'][0]['Link'] - multi_gene = [] - multi_full_name = [] - multi_uid = [] - for result in results: - uid = result['Id'] - handle = Entrez.esummary(db="gene", id=uid) - uid_record = Entrez.read(handle) - - records.append(uid_record) - handle.close() - uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0] - gene_name = uid_summary['Name'] - gene_full_name = uid_summary['Description'] - if len(results) > 1: - multi_gene.append(gene_name) - multi_full_name.append(gene_full_name) - multi_uid.append(uid) - else: - multi_gene = gene_name - multi_full_name = gene_full_name - multi_uid = uid - - if len(results) > 1: - multi_uid = "#".join(multi_uid) - multi_gene = "#".join(multi_gene) - multi_full_name = "#".join(multi_full_name) - - self.uids.append(multi_uid) - self.gene_names.append(multi_gene) - self.gene_full_names.append(multi_full_name) - self.saved_snp_id.append(snp_id) - return removal_index, records, self.uids, self.gene_names, self.gene_full_names - -# Example usage: -api_key = "19bea34a4dbdbc6ef30392cee15943365309" # 
-email = "lrmercadod@gmail.com" # Replace with your email
+    def consolidate_files(self, query):
+        directory = "./results/baseline_doc/"
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        consolidated_file_path = f"./results/baseline_doc/{normalized_query}_consolidated.txt"
+
+        # Check if there are any files to consolidate
+        if not os.listdir(directory):
+            print("No files found in the directory to consolidate.")
+            return
+
+        # Opening the consolidated file outside the loop to write all contents
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            # Loop over each file in the directory
+            for fname in os.listdir(directory):
+                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
+                    filepath = os.path.join(directory, fname)
+                    # Ensure the file is not the consolidated file itself
+                    if filepath != consolidated_file_path:
+                        # Open, read, and close the file
+                        with open(filepath, 'r', encoding='utf-8') as infile:
+                            content = infile.read()
+                            outfile.write(content + "\n")
+                            print(f"Added content from {fname} to the consolidated file.")
+
+                        # Remove the individual file after its content has been written
+                        os.remove(filepath)
+                        print(f"Removed file {fname} after consolidation.")
+        print(f"Consolidated records into {consolidated_file_path}")
+
+        # Optional: Clean up the directory if empty
+        if not os.listdir(directory):
+            shutil.rmtree(directory)
+            print("Removed empty directory after consolidation.")
+
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+email = "lrmercadod@gmail.com"
 downloader = PubMedDownloader(api_key, email)
-topic = "zinc" # Define the topic of interest
-# Fetch and save records by year
-for year in range(1990, 2023): # Example range of years
+topic = "zinc"
+
+# Fetch and save records by year, then consolidate and clean up
+for year in range(1990, 2025):
     year_query = f"{topic} AND {year}[Date]"
-    downloader.fetch_pubmed_data(year_query, year)
\ No newline at end of file
+    if not downloader.fetch_pubmed_data(year_query, year):
+        print(f"No data found or failed to fetch for {year_query}")
+
+# Consolidate all files into one
+downloader.consolidate_files(topic)
\ No newline at end of file
diff --git a/code/step_1_data_collection_Luis_.py b/code/step_1_data_collection_Luis_.py
index 3c2fa52..9768f7d 100644
--- a/code/step_1_data_collection_Luis_.py
+++ b/code/step_1_data_collection_Luis_.py
@@ -1,43 +1,81 @@
 import requests
+from Bio import Entrez
+from io import BytesIO
 import time
+import os
+import shutil
 
-def fetch_pubmed_data(query, max_results=1000000):
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
-    api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
-    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
-    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
-
-    # Perform the initial search
-    search_response = requests.get(search_url)
-    if search_response.status_code != 200:
-        print("Failed to retrieve data")
-        return
-
-    search_results = search_response.text
-    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
-    id_list = id_list.strip().split()
-
-    print(f"Found {len(id_list)} records, fetching data...")
-
-    # Fetch details of all IDs
-    records = []
-    for start in range(0, len(id_list), 500): # PubMed allows fetching up to 500 records at a time
-        end = min(start + 500, len(id_list))
-        ids = ','.join(id_list[start:end])
-        fetch_response = requests.get(f"{fetch_url}&id={ids}")
-        if fetch_response.status_code == 200:
-            records.append(fetch_response.text)
-        else:
-            print("Failed to fetch data for some records.")
-        time.sleep(0.5) # to prevent hitting rate limit
-
-    return records
-
-# Example usage
-topic = "zinc"
-downloaded_data = fetch_pubmed_data(topic)
-
-# Optionally, save the data to a file
-with open("pubmed_data.txt", "w") as file:
-    for record in downloaded_data:
-        file.write(record)
+class PubMedDownloader:
+    def __init__(self, api_key, email):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
+        self.email = email
+        Entrez.email = email # Set email for NCBI E-utilities
+
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        records = []
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query} AND open access[filter]&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    if count > 0:
+                        for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                            fetch_response = requests.get(fetch_url, timeout=10)
+                            records.append(fetch_response.text)
+                            print(f"Fetched records starting from {start}")
+                        return self.save_records_to_file(normalized_query, year, records)
+                break
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)
+        return None
+
+    def save_records_to_file(self, query, year, records):
+        directory = "./results/baseline_doc/"
+        os.makedirs(directory, exist_ok=True)
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))
+        print(f"Saved records to {file_path}")
+        return file_path
+
+    def consolidate_files(self, query):
+        directory = "./results/baseline_doc/"
+        consolidated_file_path = f"./results/baseline_doc/{query}_consolidated.txt"
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            for fname in os.listdir(directory):
+                if fname.startswith(query) and fname.endswith(".txt"):
+                    filepath = os.path.join(directory, fname)
+                    with open(filepath, 'r', encoding='utf-8') as infile:
+                        outfile.write(infile.read() + "\n")
+                    os.remove(filepath) # Remove the file after consolidating
+        print(f"Consolidated records into {consolidated_file_path}")
+
+# Usage example:
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+email = "lrmercadod@gmail.com"
+downloader = PubMedDownloader(api_key, email)
+topic = "zinc"
+
+# Fetch and save records by year, then consolidate and clean up
+for year in range(1990, 2025):
+    year_query = f"{topic} AND {year}[Date]"
+    if not downloader.fetch_pubmed_data(year_query, year):
+        print(f"No data found or failed to fetch for {year_query}")
+
+# Consolidate all files into one
+downloader.consolidate_files(topic)
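
Note: with config.ini now git-ignored, the rewritten step1_data_collection_Custom_Luis.py expects a local config.ini supplying the api_key and email values it reads via config.get('DEFAULT', ...). A minimal sketch of a helper that generates such a file, assuming only the DEFAULT-section layout implied by the code above (the file name generate_config.py and both values are placeholders, not real credentials):

    # generate_config.py (hypothetical helper; not part of this diff)
    import configparser

    config = configparser.ConfigParser()
    config['DEFAULT'] = {
        'api_key': 'YOUR_NCBI_API_KEY',   # placeholder, use your own NCBI key
        'email': 'you@example.com',       # placeholder, use your own address
    }
    with open('config.ini', 'w') as f:
        config.write(f)  # writes a [DEFAULT] section readable by config.get('DEFAULT', ...)

Running it once in the repository root produces a config.ini that the collection script can read without committing credentials to version control.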