
Commit

Initial security update
Changes to files and optimization of the downloaders.
lrm22005 committed May 7, 2024
1 parent 0e4993d commit 860d0fe
Showing 5 changed files with 289 additions and 195 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -37,3 +37,4 @@ results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
2 changes: 1 addition & 1 deletion code/step1_data_collection.py
@@ -52,7 +52,7 @@
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 100
batch_size = 1000
#############################
#####################
gene_end_point = round(query_size/batch_size)
219 changes: 147 additions & 72 deletions code/step1_data_collection_Custom_Luis.py
@@ -1,76 +1,151 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
"""

import os
import pathlib
import sys
import time
import urllib.error

sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

if len(sys.argv) > 3:
word_query = str(sys.argv[1])
word_end_point = int(sys.argv[2]) # the endpoint of a word-based data collection. for demo-b 100000
gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection for demo-b 50
paths = str(sys.argv[4]) + '/'
elif len(sys.argv) == 3:
word_query = str(sys.argv[1])
paths = str(sys.argv[2]) + '/'

data_dir = os.path.abspath(os.getcwd())
output_dir = os.path.join(data_dir, paths + 'baseline_doc')
document_output_dir = os.path.join(data_dir, paths + 'gene2document')
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)

email = "lrmercadod@gmail.com" # Replace with your valid email address
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

# setting up
########### word query based literature data collection #################
gap = 1000
batch = 200
w2d_starting_point = 0
import shutil
import logging
import requests
from Bio import Entrez
from io import BytesIO
import configparser

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class PubMedDownloader:
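# Wraps the NCBI E-utilities endpoints (esearch + efetch with usehistory) to
# pull MEDLINE-format PubMed records in batches.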
def __init__(self, api_key, email, max_records_per_query=9999):
self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
self.api_key = api_key
self.email = email
self.max_records_per_query = max_records_per_query
Entrez.email = email # Set email for NCBI E-utilities

def fetch_pubmed_data(self, query, year):
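# Run esearch once (retmax=1, usehistory=y) to get the total count plus
# WebEnv/QueryKey, then efetch MEDLINE text from the history server,
# retrying with exponential backoff on request errors.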
normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
records = []
attempt = 0
max_attempts = 5

while attempt < max_attempts:
try:
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url, timeout=10)
if search_response.status_code == 200:
search_results = Entrez.read(BytesIO(search_response.content))
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
count = int(search_results['Count'])
logging.info(f"Total records found for the query '{query}': {count}")

if count > 0:
for start in range(0, min(count, self.max_records_per_query), self.max_records_per_query):  # capped at max_records_per_query records, i.e. a single efetch page
fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
fetch_response = requests.get(fetch_url, timeout=10)
records.append(fetch_response.text)
logging.info(f"Fetched records starting from {start}")
return self.save_records_to_file(normalized_query, year, records)
else:
logging.info(f"No records found for the query '{query}'")
return []
break  # non-200 response: stop retrying
except requests.exceptions.RequestException as e:
attempt += 1
logging.error(f"Attempt {attempt}: An error occurred: {e}")
time.sleep(2 ** attempt)
return []

def save_records_to_file(self, query, year, records):
directory = os.path.join(".", "results", "baseline_doc")
os.makedirs(directory, exist_ok=True)
filename = f"{query}.{year}.txt"
file_path = os.path.join(directory, filename)
with open(file_path, 'w', encoding='utf-8') as file:
file.write("\n".join(records))
logging.info(f"Saved records to {file_path}")
return file_path

def consolidate_files(self, query):
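# Merge every per-year file for this query into a single
# <query>_consolidated.txt, deleting the per-year files as they are merged.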
directory = os.path.join(".", "results", "baseline_doc")
normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")

# Check if there are any files to consolidate
if not os.listdir(directory):
logging.info("No files found in the directory to consolidate.")
return

# Opening the consolidated file outside the loop to write all contents
with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
# Loop over each file in the directory
for fname in os.listdir(directory):
if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
filepath = os.path.join(directory, fname)
# Ensure the file is not the consolidated file itself
if filepath != consolidated_file_path:
# Open, read, and close the file
with open(filepath, 'r', encoding='utf-8') as infile:
content = infile.read()
outfile.write(content + "\n")
logging.info(f"Added content from {fname} to the consolidated file.")

# Remove the individual file after its content has been written
try:
os.remove(filepath)
logging.info(f"Removed file {fname} after consolidation.")
except OSError as e:
logging.error(f"Error occurred while removing file {fname}: {e}")

logging.info(f"Consolidated records into {consolidated_file_path}")

# Optional: Clean up the directory if empty
if not os.listdir(directory):
shutil.rmtree(directory)
logging.info("Removed empty directory after consolidation.")

# # Read API key and email from the configuration file
# config = configparser.ConfigParser()
# config.read('config.ini')
# api_key = config.get('pubmed', 'api_key')
# email = config.get('pubmed', 'email')

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())

config = configparser.ConfigParser()
config_path = 'config.ini' # Make sure this path is correct

# Check if the config file exists to rule out path issues
if not os.path.exists(config_path):
print(f"Configuration file not found at {config_path}")
else:
print(f"Configuration file found at {config_path}")

try:
search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
print('The number of available abstracts:', _word_end_point, 'for', word_query)

if int(sys.argv[2]) == 0:
word_end_point = _word_end_point

ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point,
ixs=w2d_starting_point, test_end_point=word_end_point)
except urllib.error.HTTPError as e:
print(f"An HTTP error occurred: {e}")
print("Retrying in 5 seconds...")
time.sleep(5)
# Retry the request or handle the error appropriately

########### gene name-query based literature data collection #################
query_full = ld.text_open('./data/gene_name_info/query_full_name.txt')
query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt')
# gene name list
query_size = len(query_full)
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 10

############################
gene_end_point = round(query_size / batch_size)

if len(sys.argv) > 2:
gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection

if int(sys.argv[3]) == 0:
gene_end_point = round(query_size / batch_size)

ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point,
query_len=len(query_full), end_point=gene_end_point)
config.read(config_path)
# Explicitly list sections and keys
print("Sections available:", config.sections())
# Attempt to read the API key and email from the 'DEFAULT' section
api_key = config.get('DEFAULT', 'api_key')
email = config.get('DEFAULT', 'email')
except configparser.NoSectionError as e:
print(f"Missing section in your configuration file: {e}")
except configparser.NoOptionError as e:
print(f"Missing option in your configuration file: {e}")
except Exception as e:
print(f"An error occurred while reading the configuration file: {e}")

# Create an instance of PubMedDownloader
downloader = PubMedDownloader(api_key, email)
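# max_records_per_query defaults to 9999 (see __init__), so each call to
# fetch_pubmed_data returns at most 9999 MEDLINE records per query.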

# Define the topic and year range
topic = "gene expression"
start_year = 1990
end_year = 2024

# Fetch and save records by year, then consolidate and clean up
for year in range(start_year, end_year + 1):
year_query = f"{topic} AND {year}[Date]"
downloader.fetch_pubmed_data(year_query, year)

# Consolidate all files into one
downloader.consolidate_files(topic)
