Merge pull request #1 from lrm22005/Luis
Codes update
Showing 7 changed files with 149 additions and 142 deletions.
Binary file not shown.
@@ -1,122 +1,107 @@

Removed:

# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
Updated to include robust retry mechanism and API rate limiting
"""

import os
import pathlib
import sys
import time
import urllib.error

# Ensuring the correct append path for 'lib'
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
from lib.Loading_PudMed import ids_pudmed as pudmed

class literature_data_collection:
    def __init__(self, email, output_dir, document_output_dir, api_key=None):
        self.output_dir = output_dir
        self.document_output_dir = document_output_dir
        self.email = email
        print("Initialized literature_data_collection with email: {}".format(email))

    def text_open(self, path):
        with open(path, 'r') as f:
            data = f.read().strip().split('\n')
        return data

    def word_based_query_fit(self, year=None, user_term="heart"):
        pud = pudmed()
        print("Created pudmed instance for searching.")
        search_results, end_point = pud.search_list(user_term, year, self.email)
        return search_results, end_point

    def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
        pud = pudmed()
        print("Collecting documents using word-based query.")
        search_results, end_point = pud.search_list(user_term, year, self.email)
        if test_end_point != 0:
            end_point = test_end_point
            print('Checking data collection performance --- collecting until', end_point, 'documents')
        next_start = starting
        for ix in range(ixs, round(end_point / gap) + 1):
            next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
            if next_start >= end_point:
                break

    def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
        success = False
        attempts = 0
        while not success and attempts < 5:
            try:
                print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
                pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
                success = True
            except urllib.error.HTTPError as e:
                attempts += 1
                wait_time = 2 ** attempts
                print(f"An HTTP error occurred: {e}")
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        if not success:
            print("Failed after 5 attempts, skipping this batch.")
        return starting + gap  # Returns the next starting point

if __name__ == "__main__":
    if len(sys.argv) > 3:
        word_query = str(sys.argv[1])
        word_end_point = int(sys.argv[2])
        gene_end_point = int(sys.argv[3])
        paths = str(sys.argv[4]) + '/'
    elif len(sys.argv) == 3:
        word_query = str(sys.argv[1])
        paths = str(sys.argv[2]) + '/'

    data_dir = os.path.abspath(os.getcwd())
    output_dir = os.path.join(data_dir, paths + 'baseline_doc')
    document_output_dir = os.path.join(data_dir, paths + 'gene2document')
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(document_output_dir, exist_ok=True)

    email = "lrmercadod@gmail.com"  # Replace with your valid email address
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
    ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

    gap = 50000  # Adjust as needed
    batch = 10000  # Adjust as needed
    w2d_starting_point = 0  # Adjust if resuming from a different point

    try:
        search_results, word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
        print('The number of available abstracts:', word_end_point, 'for', word_query)
        if int(sys.argv[2]) == 0:
            word_end_point = word_end_point
        ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
    except urllib.error.HTTPError as e:
        print(f"An HTTP error occurred: {e}")
        print("Retrying in 5 seconds...")
        time.sleep(5)

    # Assuming gene data is prepared and ready to be processed
    try:
        query_full = ld.text_open('data/gene_name_info/query_full_name.txt')  # Adjust path as necessary
        query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt')  # Adjust path as necessary
        query_size = len(query_full)
        ld.gene_based_query_fit(query_size, query_full, query_symbol)

        g2d_starting_point = 0
        batch_size = 10
        gene_end_point = round(query_size / batch_size)
        if len(sys.argv) > 2:
            gene_end_point = int(sys.argv[3])
            if int(sys.argv[3]) == 0:
                gene_end_point = round(query_size / batch_size)

        ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
    except Exception as e:
        print(f"Error during gene-based data collection: {e}")
Added:

# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
Updated to include robust retry mechanism and API rate limiting
"""

import requests
from Bio import Entrez
from io import StringIO
from Bio import Medline
from io import BytesIO

class PubMedDownloader:
    def __init__(self, api_key, email):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        Entrez.email = email  # Setting email for Biopython Entrez

    def fetch_pubmed_data(self, query, batch_size=10000):
        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
        search_response = requests.get(search_url)
        if search_response.status_code == 200:
            try:
                # Use BytesIO for binary data
                search_results = Entrez.read(BytesIO(search_response.content))
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
                count = int(search_results['Count'])
                print(f"Total records found: {count}")
            except Exception as e:
                print("Error reading search results:", e)
                return []
        else:
            print("Failed to retrieve search results")
            return []

        records = []
        for start in range(0, count, batch_size):
            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
            fetch_response = requests.get(fetch_url)
            if fetch_response.status_code == 200:
                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
                print(f"Fetched {start + batch_size} of {count} records")
            else:
                print(f"Failed to fetch data for batch starting at {start}")

        return records

class ids_pubmed():
    def __init__(self):
        self.snp_ids = []
        self.uids = []
        self.gene_names = []
        self.names = []
        self.records = []
        self.gene_full_names = []
        self.saved_snp_id = []

    def search_ids(self, search_email):
        removal_index = []
        Entrez.email = search_email
        records = []
        for snp_id in self.snp_ids:
            record = Entrez.read(Entrez.elink(dbfrom="snp",
                                              id=snp_id.replace('rs', ''),
                                              db="gene"))
            if record[0]['LinkSetDb'] == []:
                removal_index.append(snp_id)
                print("index is removed: ", snp_id)
            else:
                results = record[0]['LinkSetDb'][0]['Link']
                multi_gene = []
                multi_full_name = []
                multi_uid = []
                for result in results:
                    uid = result['Id']
                    handle = Entrez.esummary(db="gene", id=uid)
                    uid_record = Entrez.read(handle)
                    records.append(uid_record)
                    handle.close()
                    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
                    gene_name = uid_summary['Name']
                    gene_full_name = uid_summary['Description']
                    if len(results) > 1:
                        multi_gene.append(gene_name)
                        multi_full_name.append(gene_full_name)
                        multi_uid.append(uid)
                    else:
                        multi_gene = gene_name
                        multi_full_name = gene_full_name
                        multi_uid = uid

                if len(results) > 1:
                    multi_uid = "#".join(multi_uid)
                    multi_gene = "#".join(multi_gene)
                    multi_full_name = "#".join(multi_full_name)

                self.uids.append(multi_uid)
                self.gene_names.append(multi_gene)
                self.gene_full_names.append(multi_full_name)
                self.saved_snp_id.append(snp_id)
        return removal_index, records, self.uids, self.gene_names, self.gene_full_names

# Example usage:
api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
email = "lrmercadod@gmail.com"  # Replace with your email
downloader = PubMedDownloader(api_key, email)
topic = "zinc"  # Define the topic of interest
pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
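The docstring promises API rate limiting, but fetch_pubmed_data issues its requests back to back. A small throttled client along these lines could keep the loop within NCBI's published E-utilities limits (3 requests per second without an API key, 10 per second with one); RateLimitedClient is a hypothetical helper sketched here, not code from this commit.

import time
import requests

class RateLimitedClient:
    # Spaces out HTTP GETs so E-utilities calls stay under the request budget.
    def __init__(self, api_key=None):
        self.min_interval = 0.11 if api_key else 0.34  # seconds between calls
        self._last_call = 0.0

    def get(self, url, **kwargs):
        wait = self.min_interval - (time.monotonic() - self._last_call)
        if wait > 0:
            time.sleep(wait)
        self._last_call = time.monotonic()
        return requests.get(url, timeout=30, **kwargs)

# Usage sketch: client = RateLimitedClient(api_key); client.get(search_url)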
@@ -1,21 +1,43 @@

Removed:

from Bio import Entrez
import time

def download_data(query, batch_size=1000, delay=1):
    Entrez.email = "your.email@example.com"
    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
    record = Entrez.read(handle)
    ids = record["IdList"]
    total = len(ids)
    print(f"Total number of records: {total}")
    for i in range(0, total, batch_size):
        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
        ids_batch = ids[i:i+batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
        data = handle.read()
        # Do something with the data, e.g., save it to a file
        with open("data.txt", "a", encoding='utf-8') as f:
            f.write(data)
        handle.close()
        time.sleep(delay)

download_data("zinc")
Added:

import requests
import time

def fetch_pubmed_data(query, max_results=1000000):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"

    # Perform the initial search
    search_response = requests.get(search_url)
    if search_response.status_code != 200:
        print("Failed to retrieve data")
        return

    search_results = search_response.text
    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
    id_list = id_list.strip().split()

    print(f"Found {len(id_list)} records, fetching data...")

    # Fetch details of all IDs
    records = []
    for start in range(0, len(id_list), 500):  # PubMed allows fetching up to 500 records at a time
        end = min(start + 500, len(id_list))
        ids = ','.join(id_list[start:end])
        fetch_response = requests.get(f"{fetch_url}&id={ids}")
        if fetch_response.status_code == 200:
            records.append(fetch_response.text)
        else:
            print("Failed to fetch data for some records.")
        time.sleep(0.5)  # to prevent hitting rate limit

    return records

# Example usage
topic = "zinc"
downloaded_data = fetch_pubmed_data(topic)

# Optionally, save the data to a file
with open("pubmed_data.txt", "w") as file:
    for record in downloaded_data:
        file.write(record)
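One caveat with the string-splitting approach above: splitting the esearch XML on whitespace leaves each PMID wrapped in <Id>...</Id> tags, so the IDs passed to efetch still contain markup. A sturdier sketch using the standard library's XML parser (the function name search_pmids is illustrative, not from this repository):

import xml.etree.ElementTree as ET
import requests

def search_pmids(query, retmax=10000):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    response = requests.get(url, params={"db": "pubmed", "term": query, "retmax": retmax})
    response.raise_for_status()
    root = ET.fromstring(response.text)
    return [elem.text for elem in root.findall(".//IdList/Id")]  # bare PMIDs, no tags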
Three of the changed files are empty.