Merge pull request #5 from lrm22005/Luis

Luis updating codes

lrm22005 authored Jul 25, 2024
2 parents 9679d69 + 1613e69 commit d60131f
Showing 7 changed files with 109 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -38,3 +38,6 @@ results/baseline_doc/pubmed.zinc.2.15.txt
 results/baseline_doc/zinc AND 2013\[Date\].2013.txt
 /results
 config.ini
+/gene_based_records
+checkpoint.json
+checkpoint.json
1 change: 1 addition & 0 deletions checkpoint.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion code/lib/Loading_PudMed.py
@@ -12,6 +12,7 @@
 from io import StringIO
 import time
 sys.path.append('lib')
+from http.client import IncompleteRead

 from Bio import Medline
 import os
@@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch):
 data = fetch_handle.read()
 fetch_handle.close()
 out_handle.write(data)
-time.sleep(2) # Delay between each batch fetch to respect the API rate limit
+time.sleep(5) # Delay between each batch fetch to respect the API rate limit
 out_handle.close()

 def fetch_rec(self, rec_id, entrez_handle):
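
The longer delay and the new IncompleteRead import address the same failure mode: NCBI sometimes truncates large EFetch responses, which surfaces as http.client.IncompleteRead. A minimal sketch of how that exception could be retried around the read/write loop above, assuming a callable that re-opens the handle (the wrapper below is illustrative, not part of the commit):

# Sketch only (not the committed code): retry a truncated EFetch read,
# assuming the caller can re-open the handle. Names and parameters here
# are illustrative.
from http.client import IncompleteRead
import time

def read_with_retry(open_handle, max_attempts=3, delay=5):
    for attempt in range(1, max_attempts + 1):
        handle = open_handle()  # callable that re-opens the EFetch handle
        try:
            data = handle.read()
            handle.close()
            return data
        except IncompleteRead:
            handle.close()
            time.sleep(delay * attempt)  # back off, echoing the 5 s pacing above
    raise RuntimeError(f"EFetch still truncated after {max_attempts} attempts")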
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions code/step1_data_collection.py
@@ -12,7 +12,7 @@
 sys.path.append('lib')
 from lib.Literature_Data_Collection import literature_data_collection

-years = 15
+years = 35

 if len(sys.argv)>3:
     word_query = str(sys.argv[1])
@@ -52,7 +52,7 @@
 ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

 g2d_starting_point = 0
-batch_size = 1000
+batch_size = 100
 #############################
 #####################
 gene_end_point = round(query_size/batch_size)
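
Lowering batch_size from 1000 to 100 means ten times as many EFetch calls, but each response is smaller and so less likely to time out or arrive truncated, which fits the slower pacing and the IncompleteRead handling above. A rough illustration of the paging arithmetic (query_size is an invented example value):

# Illustration only; query_size is a made-up value for this example.
query_size = 4321
batch_size = 100                                 # was 1000 before this commit
gene_end_point = round(query_size / batch_size)  # 43 batches instead of 4
for start in range(0, query_size, batch_size):
    end = min(start + batch_size, query_size)
    print(f"fetch records {start}..{end - 1}")   # each call asks for at most 100 records
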
101 changes: 101 additions & 0 deletions code/step1_data_collection_Luis_genes.py
@@ -0,0 +1,101 @@
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json

class GenePubMedDownloader:
    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        Entrez.email = email  # Set email for NCBI E-utilities
        self.max_records_per_query = max_records_per_query
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]
        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        records = []
        attempt = 0
        max_attempts = 5

        while attempt < max_attempts:
            try:
                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    if 'WebEnv' in search_results and 'QueryKey' in search_results:
                        webenv = search_results['WebEnv']
                        query_key = search_results['QueryKey']
                        count = int(search_results['Count'])
                        logging.info(f"Total records found for {gene_name}: {count}")
                        if count > 0:
                            # Page through the history server in chunks of max_records_per_query
                            for start in range(0, count, self.max_records_per_query):
                                fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                                fetch_response = requests.get(fetch_url, timeout=10)
                                records.append(fetch_response.text)
                                logging.info(f"Fetched records for {gene_name} starting from {start}")
                        else:
                            logging.info(f"No records found for {gene_name}.")
                            return None
                    else:
                        logging.error(f"No WebEnv/QueryKey found in the search results for {gene_name}.")
                        return None
                    file_path = self.save_records_to_file(normalized_gene, records)
                    self.checkpoint_data[gene_name] = file_path
                    self.save_checkpoint()
                    return file_path
                logging.error(f"ESearch for {gene_name} failed with HTTP {search_response.status_code}")
                attempt += 1
                time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)
        return None

    def save_records_to_file(self, gene_name, records):
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)

def load_gene_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Example Usage
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
email = "lrmercadod@gmail.com"
output_dir = "./gene_based_records/"
downloader = GenePubMedDownloader(api_key, email, output_dir)

# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
    downloader.fetch_pubmed_data(gene)
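
The checkpoint file is what makes a long run resumable: each completed gene is recorded with its output path, so fetch_pubmed_data skips it on the next run. A quick, illustrative way to check progress from checkpoint.json (not part of the commit):

# Illustrative snippet, not in the commit: count genes already fetched.
import json

with open("checkpoint.json", "r", encoding="utf-8") as fh:
    done = json.load(fh)  # maps gene name/symbol -> path of saved records
print(f"{len(done)} genes already fetched")
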
File renamed without changes.
