Codes update #1

Merged 1 commit on May 6, 2024
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions code/step1_data_collection.py
@@ -33,9 +33,9 @@
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

########### word query based literature data collection #################
-gap=10000
-batch = 1000
-w2d_starting_point = 0
+gap=9000
+batch = 400
+w2d_starting_point = 2

search_results, _word_end_point = ld.word_based_query_fit(year = years, user_term=word_query)
print('The number of available abstracts :', _word_end_point, 'for ', word_query)
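For context, a minimal sketch of how these three parameters partition the result set into download windows, assuming the same loop shape as collecting_doc_using_word_based_query in the collection code below; the end_point value here is hypothetical:

# Illustrative only: how gap and w2d_starting_point slice the available abstracts.
end_point = 45000            # hypothetical count of available abstracts
gap = 9000                   # documents requested per outer window
batch = 400                  # documents per fetch call inside a window
w2d_starting_point = 2       # resume from the third window (0-based)

for ix in range(w2d_starting_point, round(end_point / gap) + 1):
    start = ix * gap
    stop = min(start + gap, end_point)
    print(f"window {ix}: records {start}..{stop}, fetched in batches of {batch}")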
223 changes: 104 additions & 119 deletions code/step_1_data_collection_Luis.py
@@ -1,122 +1,107 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
Updated to include robust retry mechanism and API rate limiting
"""

import os
import pathlib
import sys
import time
import urllib.error

# Ensure the 'lib' directory is on the import path
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
from lib.Loading_PudMed import ids_pudmed as pudmed

class literature_data_collection:
def __init__(self, email, output_dir, document_output_dir, api_key=None):
self.output_dir = output_dir
self.document_output_dir = document_output_dir
self.email = email
import requests
from Bio import Entrez
from io import StringIO
from Bio import Medline
from io import BytesIO

class PubMedDownloader:
def __init__(self, api_key, email):
self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
self.api_key = api_key
print("Initialized literature_data_collection with email: {}".format(email))

def text_open(self, path):
with open(path, 'r') as f:
data = f.read().strip().split('\n')
return data

def word_based_query_fit(self, year=None, user_term="heart"):
pud = pudmed()
print("Created pudmed instance for searching.")
search_results, end_point = pud.search_list(user_term, year, self.email)
return search_results, end_point

def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
pud = pudmed()
print("Collecting documents using word-based query.")
search_results, end_point = pud.search_list(user_term, year, self.email)
if test_end_point != 0:
end_point = test_end_point
print('Checking data collection performance --- collecting until', end_point, 'documents')
next_start = starting
for ix in range(ixs, round(end_point/gap) + 1):
next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
if next_start >= end_point:
break
self.email = email
Entrez.email = email # Setting email for Biopython Entrez

def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
success = False
attempts = 0
while not success and attempts < 5:
def fetch_pubmed_data(self, query, batch_size=10000):
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url)
if search_response.status_code == 200:
try:
print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
success = True
except urllib.error.HTTPError as e:
attempts += 1
wait_time = 2 ** attempts
print(f"An HTTP error occurred: {e}")
print(f"Retrying in {wait_time} seconds...")
time.sleep(wait_time)

if not success:
print("Failed after 5 attempts, skipping this batch.")
return starting + gap # Returns the next starting point

if __name__ == "__main__":
if len(sys.argv) > 3:
word_query = str(sys.argv[1])
word_end_point = int(sys.argv[2])
gene_end_point = int(sys.argv[3])
paths = str(sys.argv[4]) + '/'
elif len(sys.argv) == 3:
word_query = str(sys.argv[1])
paths = str(sys.argv[2]) + '/'

data_dir = os.path.abspath(os.getcwd())
output_dir = os.path.join(data_dir, paths + 'baseline_doc')
document_output_dir = os.path.join(data_dir, paths + 'gene2document')
os.makedirs(output_dir, exist_ok=True)
os.makedirs(document_output_dir, exist_ok=True)

email = "lrmercadod@gmail.com" # Replace with your valid email address
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

gap = 50000 # Adjust as needed
batch = 10000 # Adjust as needed
w2d_starting_point = 0 # Adjust if resuming from a different point

try:
search_results, word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
print('The number of available abstracts:', word_end_point, 'for', word_query)

if int(sys.argv[2]) == 0:
word_end_point = word_end_point

ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
except urllib.error.HTTPError as e:
print(f"An HTTP error occurred: {e}")
print("Retrying in 5 seconds...")
time.sleep(5)

# Assuming gene data is prepared and ready to be processed
try:
query_full = ld.text_open('data/gene_name_info/query_full_name.txt') # Adjust path as necessary
query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt') # Adjust path as necessary
query_size = len(query_full)
ld.gene_based_query_fit(query_size, query_full, query_symbol)

g2d_starting_point = 0
batch_size = 10
gene_end_point = round(query_size / batch_size)
if len(sys.argv) > 2:
gene_end_point = int(sys.argv[3])
if int(sys.argv[3]) == 0:
gene_end_point = round(query_size / batch_size)

ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
except Exception as e:
print(f"Error during gene-based data collection: {e}")
# Use BytesIO for binary data
search_results = Entrez.read(BytesIO(search_response.content))
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
count = int(search_results['Count'])
print(f"Total records found: {count}")
except Exception as e:
print("Error reading search results:", e)
return []
else:
print("Failed to retrieve search results")
return []

records = []
for start in range(0, count, batch_size):
fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
fetch_response = requests.get(fetch_url)
if fetch_response.status_code == 200:
records.extend(fetch_response.content.decode('utf-8').split('\n\n')) # Each record separated by two newlines
print(f"Fetched {start + batch_size} of {count} records")
else:
print(f"Failed to fetch data for batch starting at {start}")

return records

class ids_pubmed():
def __init__(self):
self.snp_ids = []
self.uids = []
self.gene_names = []
self.names = []
self.records = []
self.gene_full_names = []
self.saved_snp_id = []

def search_ids(self, search_email):
removal_index = []
Entrez.email = search_email
records = []
for snp_id in self.snp_ids:
record = Entrez.read(Entrez.elink(dbfrom="snp",
id=snp_id.replace('rs', ''),
db="gene"))
if record[0]['LinkSetDb'] == []:
removal_index.append(snp_id)
print("index is removed: ", snp_id)

else:
results = record[0]['LinkSetDb'][0]['Link']
multi_gene = []
multi_full_name = []
multi_uid = []
for result in results:
uid = result['Id']
handle = Entrez.esummary(db="gene", id=uid)
uid_record = Entrez.read(handle)

records.append(uid_record)
handle.close()
uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
gene_name = uid_summary['Name']
gene_full_name = uid_summary['Description']
if len(results) > 1:
multi_gene.append(gene_name)
multi_full_name.append(gene_full_name)
multi_uid.append(uid)
else:
multi_gene = gene_name
multi_full_name = gene_full_name
multi_uid = uid

if len(results) > 1:
multi_uid = "#".join(multi_uid)
multi_gene = "#".join(multi_gene)
multi_full_name = "#".join(multi_full_name)

self.uids.append(multi_uid)
self.gene_names.append(multi_gene)
self.gene_full_names.append(multi_full_name)
self.saved_snp_id.append(snp_id)
return removal_index, records, self.uids, self.gene_names, self.gene_full_names

# Example usage:
api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
email = "lrmercadod@gmail.com" # Replace with your email

downloader = PubMedDownloader(api_key, email)
topic = "zinc" # Define the topic of interest
pubmed_records = downloader.fetch_pubmed_data(topic, 10000) # Adjust batch size as needed
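One note on the new PubMedDownloader: it issues requests.get calls back to back with no throttling. NCBI's E-utilities generally allow roughly 10 requests per second with an API key (about 3 without), so a small spacing helper could be wrapped around the fetch calls. A minimal sketch, with illustrative names that are not part of this PR:

import time
import requests

class ThrottledSession:
    """Space out HTTP calls to stay under an assumed ~10 requests/second limit."""
    def __init__(self, min_interval=0.11):
        self.min_interval = min_interval
        self._last = 0.0

    def get(self, url, **kwargs):
        wait = self.min_interval - (time.monotonic() - self._last)
        if wait > 0:
            time.sleep(wait)
        self._last = time.monotonic()
        return requests.get(url, timeout=60, **kwargs)

# Usage sketch: session = ThrottledSession(); fetch_response = session.get(fetch_url)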
62 changes: 42 additions & 20 deletions code/step_1_data_collection_Luis_.py
@@ -1,21 +1,43 @@
-from Bio import Entrez
+import requests
import time
-def download_data(query, batch_size=1000, delay=1):
-    Entrez.email = "your.email@example.com"
-    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
-    record = Entrez.read(handle)
-    ids = record["IdList"]
-    total = len(ids)
-    print(f"Total number of records: {total}")
-    for i in range(0, total, batch_size):
-        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
-        ids_batch = ids[i:i+batch_size]
-        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
-        data = handle.read()
-        # Do something with the data, e.g., save it to a file
-        with open("data.txt", "a", encoding='utf-8') as f:
-            f.write(data)
-        handle.close()
-        time.sleep(delay)
-
-download_data("zinc")
+
+def fetch_pubmed_data(query, max_results=1000000):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
+    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
+    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
+
+    # Perform the initial search
+    search_response = requests.get(search_url)
+    if search_response.status_code != 200:
+        print("Failed to retrieve data")
+        return
+
+    search_results = search_response.text
+    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
+    id_list = id_list.strip().split()
+
+    print(f"Found {len(id_list)} records, fetching data...")
+
+    # Fetch details of all IDs
+    records = []
+    for start in range(0, len(id_list), 500): # PubMed allows fetching up to 500 records at a time
+        end = min(start + 500, len(id_list))
+        ids = ','.join(id_list[start:end])
+        fetch_response = requests.get(f"{fetch_url}&id={ids}")
+        if fetch_response.status_code == 200:
+            records.append(fetch_response.text)
+        else:
+            print("Failed to fetch data for some records.")
+        time.sleep(0.5) # to prevent hitting rate limit
+
+    return records
+
+# Example usage
+topic = "zinc"
+downloaded_data = fetch_pubmed_data(topic)
+
+# Optionally, save the data to a file
+with open("pubmed_data.txt", "w") as file:
+    for record in downloaded_data:
+        file.write(record)
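The esearch response above is parsed by splitting raw XML text, which is fragile and capped at retmax=10000. A sketch of the same search-then-fetch flow using Biopython's Entrez history server instead; the email and API key values are placeholders:

from Bio import Entrez

Entrez.email = "your.email@example.com"   # placeholder
Entrez.api_key = "YOUR_NCBI_API_KEY"      # placeholder

def fetch_pubmed_medline(query, batch_size=500):
    # Search once and keep the full ID list on NCBI's history server.
    handle = Entrez.esearch(db="pubmed", term=query, usehistory="y", retmax=0)
    search = Entrez.read(handle)
    handle.close()

    count = int(search["Count"])
    records = []
    for start in range(0, count, batch_size):
        # Page through the stored result set in MEDLINE text format.
        fetch = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
                              retstart=start, retmax=batch_size,
                              webenv=search["WebEnv"], query_key=search["QueryKey"])
        records.append(fetch.read())
        fetch.close()
    return records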
Empty file added pubmed_data.txt
Empty file.