Merge pull request #1 from lrm22005/Luis
Codes update
lrm22005 authored May 6, 2024
2 parents ae973fe + abfaa4d commit e107ed9
Showing 7 changed files with 149 additions and 142 deletions.
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
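Note: Loading_PudMed.cpython-311.pyc is interpreter-generated bytecode; tracking __pycache__/ contents in git is almost never intentional, and adding a __pycache__/ entry to the repository's .gitignore would keep such files out of future commits.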
6 changes: 3 additions & 3 deletions code/step1_data_collection.py
@@ -33,9 +33,9 @@
 ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
 
 ########### word query based literature data collection #################
-gap=10000
-batch = 1000
-w2d_starting_point = 0
+gap=9000
+batch = 400
+w2d_starting_point = 2
 
 search_results, _word_end_point = ld.word_based_query_fit(year = years, user_term=word_query)
 print('The number of avaliable abstracts :', _word_end_point, 'for ', word_query)
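To make the effect of the retuned parameters concrete, here is a small illustrative sketch (not part of the commit) of how gap, batch, and w2d_starting_point partition a query's hit range into resumable windows; end_point is an assumed total, since the real count comes from word_based_query_fit:

    # Illustrative only: end_point is an assumed hit count, not a real query result.
    gap = 9000                # abstracts per outer collection window (was 10000)
    batch = 400               # abstracts per inner fetch request (was 1000)
    w2d_starting_point = 2    # resume from the third window rather than the first
    end_point = 30000         # assumed total number of matching abstracts

    for ix in range(w2d_starting_point, round(end_point / gap) + 1):
        start = ix * gap
        stop = min(start + gap, end_point)
        print(f"window {ix}: abstracts {start}-{stop}, fetched {batch} at a time")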
223 changes: 104 additions & 119 deletions code/step_1_data_collection_Luis.py
@@ -1,122 +1,107 @@
 # -*- coding: utf-8 -*-
 """
 Created on Sun Jun 21 00:16:25 2020
 Updated to include robust retry mechanism and API rate limiting
 """
 
-import os
-import pathlib
-import sys
-import time
-import urllib.error
-
-# Ensuring the correct append path for 'lib'
-sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
-from lib.Loading_PudMed import ids_pudmed as pudmed
-
-class literature_data_collection:
-    def __init__(self, email, output_dir, document_output_dir, api_key=None):
-        self.output_dir = output_dir
-        self.document_output_dir = document_output_dir
-        self.email = email
+import requests
+from Bio import Entrez
+from io import StringIO
+from Bio import Medline
+from io import BytesIO
+
+class PubMedDownloader:
+    def __init__(self, api_key, email):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
-        print("Initialized literature_data_collection with email: {}".format(email))
-
-    def text_open(self, path):
-        with open(path, 'r') as f:
-            data = f.read().strip().split('\n')
-        return data
-
-    def word_based_query_fit(self, year=None, user_term="heart"):
-        pud = pudmed()
-        print("Created pudmed instance for searching.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        return search_results, end_point
-
-    def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
-        pud = pudmed()
-        print("Collecting documents using word-based query.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        if test_end_point != 0:
-            end_point = test_end_point
-        print('Checking data collection performance --- collecting until', end_point, 'documents')
-        next_start = starting
-        for ix in range(ixs, round(end_point/gap) + 1):
-            next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
-            if next_start >= end_point:
-                break
+        self.email = email
+        Entrez.email = email  # Setting email for Biopython Entrez
 
-    def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
-        success = False
-        attempts = 0
-        while not success and attempts < 5:
+    def fetch_pubmed_data(self, query, batch_size=10000):
+        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+        search_response = requests.get(search_url)
+        if search_response.status_code == 200:
             try:
-                print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
-                pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
-                success = True
-            except urllib.error.HTTPError as e:
-                attempts += 1
-                wait_time = 2 ** attempts
-                print(f"An HTTP error occurred: {e}")
-                print(f"Retrying in {wait_time} seconds...")
-                time.sleep(wait_time)
-
-        if not success:
-            print("Failed after 5 attempts, skipping this batch.")
-        return starting + gap  # Returns the next starting point
-
-if __name__ == "__main__":
-    if len(sys.argv) > 3:
-        word_query = str(sys.argv[1])
-        word_end_point = int(sys.argv[2])
-        gene_end_point = int(sys.argv[3])
-        paths = str(sys.argv[4]) + '/'
-    elif len(sys.argv) == 3:
-        word_query = str(sys.argv[1])
-        paths = str(sys.argv[2]) + '/'
-
-    data_dir = os.path.abspath(os.getcwd())
-    output_dir = os.path.join(data_dir, paths + 'baseline_doc')
-    document_output_dir = os.path.join(data_dir, paths + 'gene2document')
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(document_output_dir, exist_ok=True)
-
-    email = "lrmercadod@gmail.com"  # Replace with your valid email address
-    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
-    ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
-
-    gap = 50000  # Adjust as needed
-    batch = 10000  # Adjust as needed
-    w2d_starting_point = 0  # Adjust if resuming from a different point
-
-    try:
-        search_results, word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
-        print('The number of available abstracts:', word_end_point, 'for', word_query)
-
-        if int(sys.argv[2]) == 0:
-            word_end_point = word_end_point
-
-        ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
-    except urllib.error.HTTPError as e:
-        print(f"An HTTP error occurred: {e}")
-        print("Retrying in 5 seconds...")
-        time.sleep(5)
-
-    # Assuming gene data is prepared and ready to be processed
-    try:
-        query_full = ld.text_open('data/gene_name_info/query_full_name.txt')  # Adjust path as necessary
-        query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt')  # Adjust path as necessary
-        query_size = len(query_full)
-        ld.gene_based_query_fit(query_size, query_full, query_symbol)
-
-        g2d_starting_point = 0
-        batch_size = 10
-        gene_end_point = round(query_size / batch_size)
-        if len(sys.argv) > 2:
-            gene_end_point = int(sys.argv[3])
-            if int(sys.argv[3]) == 0:
-                gene_end_point = round(query_size / batch_size)
-
-        ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
-    except Exception as e:
-        print(f"Error during gene-based data collection: {e}")
+                # Use BytesIO for binary data
+                search_results = Entrez.read(BytesIO(search_response.content))
+                webenv = search_results['WebEnv']
+                query_key = search_results['QueryKey']
+                count = int(search_results['Count'])
+                print(f"Total records found: {count}")
+            except Exception as e:
+                print("Error reading search results:", e)
+                return []
+        else:
+            print("Failed to retrieve search results")
+            return []
+
+        records = []
+        for start in range(0, count, batch_size):
+            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+            fetch_response = requests.get(fetch_url)
+            if fetch_response.status_code == 200:
+                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
+                print(f"Fetched {start + batch_size} of {count} records")
+            else:
+                print(f"Failed to fetch data for batch starting at {start}")
+
+        return records
+
+class ids_pubmed():
+    def __init__(self):
+        self.snp_ids = []
+        self.uids = []
+        self.gene_names = []
+        self.names = []
+        self.records = []
+        self.gene_full_names = []
+        self.saved_snp_id = []
+
+    def search_ids(self, search_email):
+        removal_index = []
+        Entrez.email = search_email
+        records = []
+        for snp_id in self.snp_ids:
+            record = Entrez.read(Entrez.elink(dbfrom="snp",
+                                              id=snp_id.replace('rs', ''),
+                                              db="gene"))
+            if record[0]['LinkSetDb'] == []:
+                removal_index.append(snp_id)
+                print("index is removed: ", snp_id)
+            else:
+                results = record[0]['LinkSetDb'][0]['Link']
+                multi_gene = []
+                multi_full_name = []
+                multi_uid = []
+                for result in results:
+                    uid = result['Id']
+                    handle = Entrez.esummary(db="gene", id=uid)
+                    uid_record = Entrez.read(handle)
+                    records.append(uid_record)
+                    handle.close()
+                    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+                    gene_name = uid_summary['Name']
+                    gene_full_name = uid_summary['Description']
+                    if len(results) > 1:
+                        multi_gene.append(gene_name)
+                        multi_full_name.append(gene_full_name)
+                        multi_uid.append(uid)
+                    else:
+                        multi_gene = gene_name
+                        multi_full_name = gene_full_name
+                        multi_uid = uid
+
+                if len(results) > 1:
+                    multi_uid = "#".join(multi_uid)
+                    multi_gene = "#".join(multi_gene)
+                    multi_full_name = "#".join(multi_full_name)
+
+                self.uids.append(multi_uid)
+                self.gene_names.append(multi_gene)
+                self.gene_full_names.append(multi_full_name)
+                self.saved_snp_id.append(snp_id)
+        return removal_index, records, self.uids, self.gene_names, self.gene_full_names
+
+# Example usage:
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
+email = "lrmercadod@gmail.com"  # Replace with your email
+
+downloader = PubMedDownloader(api_key, email)
+topic = "zinc"  # Define the topic of interest
+pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
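One caveat on the rewritten module above: its docstring still promises a "robust retry mechanism and API rate limiting", but fetch_pubmed_data calls requests.get with no retry or delay, while the deleted robust_request used exponential backoff. A minimal sketch of how that could be reinstated around the E-utilities calls (the helper name is hypothetical; NCBI allows up to 10 requests per second with an API key, 3 without):

    import time
    import requests

    def get_with_retry(url, max_attempts=5, min_interval=0.1):
        # Hypothetical helper mirroring the deleted robust_request:
        # exponential backoff on failure, plus a short pause on success to
        # stay under NCBI's 10 requests/second limit for API-key traffic.
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=60)
                if response.status_code == 200:
                    time.sleep(min_interval)
                    return response
                print(f"HTTP {response.status_code}, attempt {attempt}/{max_attempts}")
            except requests.RequestException as e:
                print(f"Request failed: {e}, attempt {attempt}/{max_attempts}")
            time.sleep(2 ** attempt)  # exponential backoff, as in robust_request
        return None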
62 changes: 42 additions & 20 deletions code/step_1_data_collection_Luis_.py
@@ -1,21 +1,43 @@
-from Bio import Entrez
+import requests
 import time
-def download_data(query, batch_size=1000, delay=1):
-    Entrez.email = "your.email@example.com"
-    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
-    record = Entrez.read(handle)
-    ids = record["IdList"]
-    total = len(ids)
-    print(f"Total number of records: {total}")
-    for i in range(0, total, batch_size):
-        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
-        ids_batch = ids[i:i+batch_size]
-        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
-        data = handle.read()
-        # Do something with the data, e.g., save it to a file
-        with open("data.txt", "a", encoding='utf-8') as f:
-            f.write(data)
-        handle.close()
-        time.sleep(delay)
-
-download_data("zinc")
+
+def fetch_pubmed_data(query, max_results=1000000):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
+    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
+    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
+
+    # Perform the initial search
+    search_response = requests.get(search_url)
+    if search_response.status_code != 200:
+        print("Failed to retrieve data")
+        return
+
+    search_results = search_response.text
+    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
+    id_list = id_list.strip().split()
+
+    print(f"Found {len(id_list)} records, fetching data...")
+
+    # Fetch details of all IDs
+    records = []
+    for start in range(0, len(id_list), 500):  # PubMed allows fetching up to 500 records at a time
+        end = min(start + 500, len(id_list))
+        ids = ','.join(id_list[start:end])
+        fetch_response = requests.get(f"{fetch_url}&id={ids}")
+        if fetch_response.status_code == 200:
+            records.append(fetch_response.text)
+        else:
+            print("Failed to fetch data for some records.")
+        time.sleep(0.5)  # to prevent hitting rate limit
+
+    return records
+
+# Example usage
+topic = "zinc"
+downloaded_data = fetch_pubmed_data(topic)
+
+# Optionally, save the data to a file
+with open("pubmed_data.txt", "w") as file:
+    for record in downloaded_data:
+        file.write(record)
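A caveat on the new fetch_pubmed_data above: splitting the esearch XML on '<IdList>' and whitespace leaves each token wrapped in <Id>…</Id> tags, so the id parameter sent to efetch is not a list of bare PMIDs. A sketch of the same extraction using the standard library's XML parser (the function name is illustrative):

    import xml.etree.ElementTree as ET

    def parse_id_list(esearch_xml):
        # Return bare PMID strings from an esearch XML response.
        root = ET.fromstring(esearch_xml)
        return [id_elem.text for id_elem in root.findall('.//IdList/Id')]

    # e.g. id_list = parse_id_list(search_response.text)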
Empty file added pubmed_data.txt
