From abfaa4d9c47513f519cdd5b8ec8f7f5d98915cb8 Mon Sep 17 00:00:00 2001
From: Luis Roberto Mercado Diaz
Date: Mon, 6 May 2024 10:34:37 -0400
Subject: [PATCH] Code update

Update with debugging changes to step1_data_collection, step_1_data_collection_Luis
and step_1_data_collection_Luis_, looking for a way to download more than 10,000
documents.
---
 .../Loading_PudMed.cpython-311.pyc          | Bin 11086 -> 11092 bytes
 code/step1_data_collection.py               |   6 +-
 code/step_1_data_collection_Luis.py         | 223 ++++++++----------
 code/step_1_data_collection_Luis_.py        |  62 +++--
 pubmed_data.txt                             |   0
 results/baseline_doc/pubmed.zinc.0.full.txt |   0
 results/baseline_doc/pubmed.zinc.2.15.txt   |   0
 7 files changed, 149 insertions(+), 142 deletions(-)
 create mode 100644 pubmed_data.txt
 create mode 100644 results/baseline_doc/pubmed.zinc.0.full.txt
 create mode 100644 results/baseline_doc/pubmed.zinc.2.15.txt

diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
index 2d733ca743877be94e03e9a6d0921546d1e83fa6..4786d65e0c0caaf3989365c9d91a0a9dc257b7b3 100644
GIT binary patch
delta 36
qcmX>Xb|s8!IWI340}xoh-pF;EiLqt!Go}>QD9->x!_A(|tQr8-PzuZd

delta 30
kcmcZ-b}o!-IWI340}$lA*vNI8iLq|-Gp3Zy0n98K0GnP3pa1{>

diff --git a/code/step1_data_collection.py b/code/step1_data_collection.py
index 16c75e8..35538d6 100644
--- a/code/step1_data_collection.py
+++ b/code/step1_data_collection.py
@@ -33,9 +33,9 @@
 ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
 
 ########### word query based literature data collection #################
-gap=10000
-batch = 1000
-w2d_starting_point = 0
+gap=9000
+batch = 400
+w2d_starting_point = 2
 
 search_results, _word_end_point = ld.word_based_query_fit(year = years, user_term=word_query)
 print('The number of avaliable abstracts :', _word_end_point, 'for ', word_query)
diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py
index 0ab1f4e..a18473b 100644
--- a/code/step_1_data_collection_Luis.py
+++ b/code/step_1_data_collection_Luis.py
@@ -1,122 +1,107 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Jun 21 00:16:25 2020
-Updated to include robust retry mechanism and API rate limiting
-"""
-
-import os
-import pathlib
-import sys
-import time
-import urllib.error
-
-# Ensuring the correct append path for 'lib'
-sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
-from lib.Loading_PudMed import ids_pudmed as pudmed
-
-class literature_data_collection:
-    def __init__(self, email, output_dir, document_output_dir, api_key=None):
-        self.output_dir = output_dir
-        self.document_output_dir = document_output_dir
-        self.email = email
+import requests
+from Bio import Entrez
+from io import StringIO
+from Bio import Medline
+from io import BytesIO
+
+class PubMedDownloader:
+    def __init__(self, api_key, email):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
         self.api_key = api_key
-        print("Initialized literature_data_collection with email: {}".format(email))
-
-    def text_open(self, path):
-        with open(path, 'r') as f:
-            data = f.read().strip().split('\n')
-        return data
-
-    def word_based_query_fit(self, year=None, user_term="heart"):
-        pud = pudmed()
-        print("Created pudmed instance for searching.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        return search_results, end_point
-
-    def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
-        pud = pudmed()
-        print("Collecting documents using word-based query.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        if test_end_point != 0:
-            end_point = test_end_point
-            print('Checking data collection performance --- collecting until', end_point, 'documents')
-        next_start = starting
-        for ix in range(ixs, round(end_point/gap) + 1):
-            next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
-            if next_start >= end_point:
-                break
+        self.email = email
+        Entrez.email = email # Setting email for Biopython Entrez
 
-    def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
-        success = False
-        attempts = 0
-        while not success and attempts < 5:
+    def fetch_pubmed_data(self, query, batch_size=10000):
+        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+        search_response = requests.get(search_url)
+        if search_response.status_code == 200:
             try:
-                print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
-                pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
-                success = True
-            except urllib.error.HTTPError as e:
-                attempts += 1
-                wait_time = 2 ** attempts
-                print(f"An HTTP error occurred: {e}")
-                print(f"Retrying in {wait_time} seconds...")
-                time.sleep(wait_time)
-
-        if not success:
-            print("Failed after 5 attempts, skipping this batch.")
-        return starting + gap # Returns the next starting point
-
-if __name__ == "__main__":
-    if len(sys.argv) > 3:
-        word_query = str(sys.argv[1])
-        word_end_point = int(sys.argv[2])
-        gene_end_point = int(sys.argv[3])
-        paths = str(sys.argv[4]) + '/'
-    elif len(sys.argv) == 3:
-        word_query = str(sys.argv[1])
-        paths = str(sys.argv[2]) + '/'
-
-    data_dir = os.path.abspath(os.getcwd())
-    output_dir = os.path.join(data_dir, paths + 'baseline_doc')
-    document_output_dir = os.path.join(data_dir, paths + 'gene2document')
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(document_output_dir, exist_ok=True)
-
-    email = "lrmercadod@gmail.com" # Replace with your valid email address
-    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
-    ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
-
-    gap = 50000 # Adjust as needed
-    batch = 10000 # Adjust as needed
-    w2d_starting_point = 0 # Adjust if resuming from a different point
-
-    try:
-        search_results, word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
-        print('The number of available abstracts:', word_end_point, 'for', word_query)
-
-        if int(sys.argv[2]) == 0:
-            word_end_point = word_end_point
-
-        ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
-    except urllib.error.HTTPError as e:
-        print(f"An HTTP error occurred: {e}")
-        print("Retrying in 5 seconds...")
-        time.sleep(5)
-
-    # Assuming gene data is prepared and ready to be processed
-    try:
-        query_full = ld.text_open('data/gene_name_info/query_full_name.txt') # Adjust path as necessary
-        query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt') # Adjust path as necessary
-        query_size = len(query_full)
-        ld.gene_based_query_fit(query_size, query_full, query_symbol)
-
-        g2d_starting_point = 0
-        batch_size = 10
-        gene_end_point = round(query_size / batch_size)
-        if len(sys.argv) > 2:
-            gene_end_point = int(sys.argv[3])
-        if int(sys.argv[3]) == 0:
-            gene_end_point = round(query_size / batch_size)
-
-        ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
-    except Exception as e:
-        print(f"Error during gene-based data collection: {e}")
+                # Use BytesIO for binary data
+                search_results = Entrez.read(BytesIO(search_response.content))
+                webenv = search_results['WebEnv']
+                query_key = search_results['QueryKey']
+                count = int(search_results['Count'])
+                print(f"Total records found: {count}")
+            except Exception as e:
+                print("Error reading search results:", e)
+                return []
+        else:
+            print("Failed to retrieve search results")
+            return []
+
+        records = []
+        for start in range(0, count, batch_size):
+            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+            fetch_response = requests.get(fetch_url)
+            if fetch_response.status_code == 200:
+                records.extend(fetch_response.content.decode('utf-8').split('\n\n')) # Each record separated by two newlines
+                print(f"Fetched {start + batch_size} of {count} records")
+            else:
+                print(f"Failed to fetch data for batch starting at {start}")
+
+        return records
+
+class ids_pubmed():
+    def __init__(self):
+        self.snp_ids = []
+        self.uids = []
+        self.gene_names = []
+        self.names = []
+        self.records = []
+        self.gene_full_names = []
+        self.saved_snp_id = []
+
+    def search_ids(self, search_email):
+        removal_index = []
+        Entrez.email = search_email
+        records = []
+        for snp_id in self.snp_ids:
+            record = Entrez.read(Entrez.elink(dbfrom="snp",
+                                              id=snp_id.replace('rs', ''),
+                                              db="gene"))
+            if record[0]['LinkSetDb'] == []:
+                removal_index.append(snp_id)
+                print("index is removed: ", snp_id)
+
+            else:
+                results = record[0]['LinkSetDb'][0]['Link']
+                multi_gene = []
+                multi_full_name = []
+                multi_uid = []
+                for result in results:
+                    uid = result['Id']
+                    handle = Entrez.esummary(db="gene", id=uid)
+                    uid_record = Entrez.read(handle)
+
+                    records.append(uid_record)
+                    handle.close()
+                    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+                    gene_name = uid_summary['Name']
+                    gene_full_name = uid_summary['Description']
+                    if len(results) > 1:
+                        multi_gene.append(gene_name)
+                        multi_full_name.append(gene_full_name)
+                        multi_uid.append(uid)
+                    else:
+                        multi_gene = gene_name
+                        multi_full_name = gene_full_name
+                        multi_uid = uid
+
+                if len(results) > 1:
+                    multi_uid = "#".join(multi_uid)
+                    multi_gene = "#".join(multi_gene)
+                    multi_full_name = "#".join(multi_full_name)
+
+                self.uids.append(multi_uid)
+                self.gene_names.append(multi_gene)
+                self.gene_full_names.append(multi_full_name)
+                self.saved_snp_id.append(snp_id)
+        return removal_index, records, self.uids, self.gene_names, self.gene_full_names
+
+# Example usage:
+api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
+email = "lrmercadod@gmail.com" # Replace with your email
+
+downloader = PubMedDownloader(api_key, email)
+topic = "zinc" # Define the topic of interest
+pubmed_records = downloader.fetch_pubmed_data(topic, 10000) # Adjust batch size as needed
\ No newline at end of file
diff --git a/code/step_1_data_collection_Luis_.py b/code/step_1_data_collection_Luis_.py
index 4e313c0..3c2fa52 100644
--- a/code/step_1_data_collection_Luis_.py
+++ b/code/step_1_data_collection_Luis_.py
@@ -1,21 +1,43 @@
-from Bio import Entrez
+import requests
 import time
-def download_data(query, batch_size=1000, delay=1):
-    Entrez.email = "your.email@example.com"
-    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
-    record = Entrez.read(handle)
-    ids = record["IdList"]
-    total = len(ids)
-    print(f"Total number of records: {total}")
-    for i in range(0, total, batch_size):
-        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
-        ids_batch = ids[i:i+batch_size]
-        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
-        data = handle.read()
-        # Do something with the data, e.g., save it to a file
-        with open("data.txt", "a", encoding='utf-8') as f:
-            f.write(data)
-        handle.close()
-        time.sleep(delay)
-
-download_data("zinc")
\ No newline at end of file
+
+def fetch_pubmed_data(query, max_results=1000000):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
+    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
+    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
+
+    # Perform the initial search
+    search_response = requests.get(search_url)
+    if search_response.status_code != 200:
+        print("Failed to retrieve data")
+        return
+
+    search_results = search_response.text
+    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
+    id_list = id_list.strip().split()
+
+    print(f"Found {len(id_list)} records, fetching data...")
+
+    # Fetch details of all IDs
+    records = []
+    for start in range(0, len(id_list), 500): # PubMed allows fetching up to 500 records at a time
+        end = min(start + 500, len(id_list))
+        ids = ','.join(id_list[start:end])
+        fetch_response = requests.get(f"{fetch_url}&id={ids}")
+        if fetch_response.status_code == 200:
+            records.append(fetch_response.text)
+        else:
+            print("Failed to fetch data for some records.")
+        time.sleep(0.5) # to prevent hitting rate limit
+
+    return records
+
+# Example usage
+topic = "zinc"
+downloaded_data = fetch_pubmed_data(topic)
+
+# Optionally, save the data to a file
+with open("pubmed_data.txt", "w") as file:
+    for record in downloaded_data:
+        file.write(record)
diff --git a/pubmed_data.txt b/pubmed_data.txt
new file mode 100644
index 0000000..e69de29
diff --git a/results/baseline_doc/pubmed.zinc.0.full.txt b/results/baseline_doc/pubmed.zinc.0.full.txt
new file mode 100644
index 0000000..e69de29
diff --git a/results/baseline_doc/pubmed.zinc.2.15.txt b/results/baseline_doc/pubmed.zinc.2.15.txt
new file mode 100644
index 0000000..e69de29