Luis updating codes #5

Merged: 2 commits, Jul 25, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -38,3 +38,6 @@ results/baseline_doc/pubmed.zinc.2.15.txt
 results/baseline_doc/zinc AND 2013\[Date\].2013.txt
 /results
 config.ini
+/gene_based_records
+checkpoint.json
+checkpoint.json
1 change: 1 addition & 0 deletions checkpoint.json

Large diffs are not rendered by default.
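Although the diff is not rendered, the checkpoint file's shape follows from the downloader script added below (step1_data_collection_Luis_genes.py): it is a flat JSON object mapping each processed gene name to the path of its saved MEDLINE file, which is how re-runs skip genes that are already done. A minimal sketch of its contents (gene names and paths are illustrative):

{
  "BRCA1": "./gene_based_records/BRCA1.txt",
  "tumor protein p53": "./gene_based_records/tumor_protein_p53.txt"
}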

3 changes: 2 additions & 1 deletion code/lib/Loading_PudMed.py
@@ -12,6 +12,7 @@
 from io import StringIO
 import time
 sys.path.append('lib')
+from http.client import IncompleteRead

 from Bio import Medline
 import os
@@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch):
             data = fetch_handle.read()
             fetch_handle.close()
             out_handle.write(data)
-            time.sleep(2) # Delay between each batch fetch to respect the API rate limit
+            time.sleep(5) # Delay between each batch fetch to respect the API rate limit
         out_handle.close()

     def fetch_rec(self, rec_id, entrez_handle):
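The new IncompleteRead import suggests the collapsed portion of this file now retries when NCBI closes a connection mid-transfer. The diff does not show that code, so the following is only a hypothetical sketch of the usual pattern; read_with_retry, open_handle, and max_attempts are invented for illustration:

from http.client import IncompleteRead
import time

def read_with_retry(open_handle, max_attempts=3):
    # open_handle is a zero-argument callable that re-opens the Entrez fetch,
    # e.g. lambda: Entrez.efetch(db="pubmed", id=rec_id, rettype="medline", retmode="text")
    for attempt in range(1, max_attempts + 1):
        handle = open_handle()
        try:
            return handle.read()
        except IncompleteRead as e:
            if attempt == max_attempts:
                return e.partial  # settle for the bytes that did arrive (may need decoding)
            time.sleep(2 * attempt)  # back off before re-opening the connection
        finally:
            handle.close()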
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions code/step1_data_collection.py
@@ -12,7 +12,7 @@
 sys.path.append('lib')
 from lib.Literature_Data_Collection import literature_data_collection

-years = 15
+years = 35

 if len(sys.argv)>3:
     word_query = str(sys.argv[1])
@@ -52,7 +52,7 @@
 ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

 g2d_starting_point = 0
-batch_size = 1000
+batch_size = 100
 #############################
 #####################
 gene_end_point = round(query_size/batch_size)
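For context on the batch_size change: gene_end_point is derived from it, so shrinking batches from 1000 to 100 records means roughly ten times as many fetch requests, each returning a smaller payload. Together with the sleep bump from 2 s to 5 s in Loading_PudMed.py above, this trades throughput for gentler use of the E-utilities endpoint. Illustrative arithmetic (the query_size value is made up):

query_size = 3000                       # illustrative; the real value comes from the query
old_batches = round(query_size / 1000)  # 3 fetches of up to 1000 records each
new_batches = round(query_size / 100)   # 30 fetches of up to 100 records each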
101 changes: 101 additions & 0 deletions code/step1_data_collection_Luis_genes.py
@@ -0,0 +1,101 @@
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json

class GenePubMedDownloader:
    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        Entrez.email = email  # Set email for NCBI E-utilities
        self.max_records_per_query = max_records_per_query
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]
        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        attempt = 0
        max_attempts = 5

        while attempt < max_attempts:
            records = []  # reset on each attempt so retries do not duplicate batches
            try:
                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    if 'WebEnv' in search_results and 'QueryKey' in search_results:
                        webenv = search_results['WebEnv']
                        query_key = search_results['QueryKey']
                        count = int(search_results['Count'])
                        logging.info(f"Total records found for {gene_name}: {count}")
                        if count > 0:
                            # Page through the history-server results in chunks of max_records_per_query
                            for start in range(0, count, self.max_records_per_query):
                                fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                                fetch_response = requests.get(fetch_url, timeout=10)
                                records.append(fetch_response.text)
                                logging.info(f"Fetched records for {gene_name} starting from {start}")
                        else:
                            logging.info(f"No records found for {gene_name}.")
                            return None
                    else:
                        logging.error(f"No WebEnv/QueryKey found in the search results for {gene_name}.")
                        return None
                    file_path = self.save_records_to_file(normalized_gene, records)
                    self.checkpoint_data[gene_name] = file_path
                    self.save_checkpoint()
                    return file_path
                # Non-200 response: count it as a failed attempt and back off
                attempt += 1
                logging.error(f"Attempt {attempt}: HTTP {search_response.status_code} for {gene_name}")
                time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # Exponential backoff between retries
        return None

    def save_records_to_file(self, gene_name, records):
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)

def load_gene_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Example usage
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
email = "lrmercadod@gmail.com"
output_dir = "./gene_based_records/"
downloader = GenePubMedDownloader(api_key, email, output_dir)

# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
    downloader.fetch_pubmed_data(gene)
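Since the records are fetched with rettype=medline and retmode=text, a quick way to confirm a downloaded file is usable is to run it through Biopython's Medline parser. A hypothetical sanity check, not part of the PR (the filename is illustrative):

from Bio import Medline

# Parse one saved file and print a PMID and title per record
with open("./gene_based_records/BRCA1.txt", encoding="utf-8") as handle:
    for record in Medline.parse(handle):
        print(record.get("PMID"), record.get("TI", "<no title>"))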