
Init updated of security #4

Merged
1 commit merged on May 7, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -37,3 +37,4 @@ results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
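
For reference, the updated collection script reads api_key and email from the DEFAULT section of this now git-ignored config.ini (see the config.get('DEFAULT', ...) calls in the diff below). A minimal, illustrative way to generate such a file with placeholder values, not part of this PR, is:

# Illustrative helper, not part of this PR: writes a config.ini with the two
# keys the updated script reads via config.get('DEFAULT', ...).
import configparser

config = configparser.ConfigParser()
config['DEFAULT'] = {
    'api_key': 'YOUR_NCBI_API_KEY',  # placeholder value
    'email': 'you@example.com',      # placeholder value
}
with open('config.ini', 'w', encoding='utf-8') as f:
    config.write(f)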
2 changes: 1 addition & 1 deletion code/step1_data_collection.py
@@ -52,7 +52,7 @@
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 100
batch_size = 1000
#############################
#####################
gene_end_point = round(query_size/batch_size)
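
For context on the batch_size change above: the number of gene-based query batches is round(query_size / batch_size), so moving from 100 to 1000 cuts the batch count tenfold. A quick illustration, assuming a gene list of 20,000 entries (an assumed figure, not taken from this repository):

# Illustration only: query_size is an assumed value, not from the repository.
query_size = 20000
for batch_size in (100, 1000):
    print(batch_size, round(query_size / batch_size))  # 100 -> 200 batches, 1000 -> 20 batches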
219 changes: 147 additions & 72 deletions code/step1_data_collection_Custom_Luis.py
@@ -1,76 +1,151 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
"""

import os
import pathlib
import sys
import time
import urllib.error

sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

if len(sys.argv) > 3:
    word_query = str(sys.argv[1])
    word_end_point = int(sys.argv[2])  # the endpoint of a word-based data collection. for demo-b 100000
    gene_end_point = int(sys.argv[3])  # the endpoint of gene name-based data collection for demo-b 50
    paths = str(sys.argv[4]) + '/'
elif len(sys.argv) == 3:
    word_query = str(sys.argv[1])
    paths = str(sys.argv[2]) + '/'

data_dir = os.path.abspath(os.getcwd())
output_dir = os.path.join(data_dir, paths + 'baseline_doc')
document_output_dir = os.path.join(data_dir, paths + 'gene2document')
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)

email = "lrmercadod@gmail.com" # Replace with your valid email address
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

# setting up
########### word query based literature data collection #################
gap = 1000
batch = 200
w2d_starting_point = 0
import shutil
import logging
import requests
from Bio import Entrez
from io import BytesIO
import configparser

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class PubMedDownloader:
    def __init__(self, api_key, email, max_records_per_query=9999):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities

    def fetch_pubmed_data(self, query, year):
        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
        records = []
        attempt = 0
        max_attempts = 5

        while attempt < max_attempts:
            try:
                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    webenv = search_results['WebEnv']
                    query_key = search_results['QueryKey']
                    count = int(search_results['Count'])
                    logging.info(f"Total records found for the query '{query}': {count}")

                    if count > 0:
                        for start in range(0, min(count, self.max_records_per_query), self.max_records_per_query):
                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                            fetch_response = requests.get(fetch_url, timeout=10)
                            records.append(fetch_response.text)
                            logging.info(f"Fetched records starting from {start}")
                        return self.save_records_to_file(normalized_query, year, records)
                    else:
                        logging.info(f"No records found for the query '{query}'")
                        return []
                break
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)
        return []

    def save_records_to_file(self, query, year, records):
        directory = os.path.join(".", "results", "baseline_doc")
        os.makedirs(directory, exist_ok=True)
        filename = f"{query}.{year}.txt"
        file_path = os.path.join(directory, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records to {file_path}")
        return file_path

    def consolidate_files(self, query):
        directory = os.path.join(".", "results", "baseline_doc")
        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
        consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")

        # Check if there are any files to consolidate
        if not os.listdir(directory):
            logging.info("No files found in the directory to consolidate.")
            return

        # Opening the consolidated file outside the loop to write all contents
        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
            # Loop over each file in the directory
            for fname in os.listdir(directory):
                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
                    filepath = os.path.join(directory, fname)
                    # Ensure the file is not the consolidated file itself
                    if filepath != consolidated_file_path:
                        # Open, read, and close the file
                        with open(filepath, 'r', encoding='utf-8') as infile:
                            content = infile.read()
                        outfile.write(content + "\n")
                        logging.info(f"Added content from {fname} to the consolidated file.")

                        # Remove the individual file after its content has been written
                        try:
                            os.remove(filepath)
                            logging.info(f"Removed file {fname} after consolidation.")
                        except OSError as e:
                            logging.error(f"Error occurred while removing file {fname}: {e}")

        logging.info(f"Consolidated records into {consolidated_file_path}")

        # Optional: Clean up the directory if empty
        if not os.listdir(directory):
            shutil.rmtree(directory)
            logging.info("Removed empty directory after consolidation.")

# # Read API key and email from the configuration file
# config = configparser.ConfigParser()
# config.read('config.ini')
# api_key = config.get('pubmed', 'api_key')
# email = config.get('pubmed', 'email')

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())

config = configparser.ConfigParser()
config_path = 'config.ini' # Make sure this path is correct

# Check if the config file exists to rule out path issues
if not os.path.exists(config_path):
    print(f"Configuration file not found at {config_path}")
else:
    print(f"Configuration file found at {config_path}")

try:
    search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
    print('The number of available abstracts:', _word_end_point, 'for', word_query)

    if int(sys.argv[2]) == 0:
        word_end_point = _word_end_point

    ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point,
                                             ixs=w2d_starting_point, test_end_point=word_end_point)
except urllib.error.HTTPError as e:
    print(f"An HTTP error occurred: {e}")
    print("Retrying in 5 seconds...")
    time.sleep(5)
    # Retry the request or handle the error appropriately

########### gene name-query based literature data collection #################
query_full = ld.text_open('./data/gene_name_info/query_full_name.txt')
query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt')
# gene name list
query_size = len(query_full)
ld.gene_based_query_fit(query_size, query_full, query_symbol)  # setting up

g2d_starting_point = 0
batch_size = 10

############################
gene_end_point = round(query_size / batch_size)

if len(sys.argv) > 2:
    gene_end_point = int(sys.argv[3])  # the endpoint of gene name-based data collection

if int(sys.argv[3]) == 0:
    gene_end_point = round(query_size / batch_size)

ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point,
                                         query_len=len(query_full), end_point=gene_end_point)
    config.read(config_path)
    # Explicitly list sections and keys
    print("Sections available:", config.sections())
    # Attempt to read the API key and email from the 'DEFAULT' section
    api_key = config.get('DEFAULT', 'api_key')
    email = config.get('DEFAULT', 'email')
except configparser.NoSectionError as e:
    print(f"Missing section in your configuration file: {e}")
except configparser.NoOptionError as e:
    print(f"Missing option in your configuration file: {e}")
except Exception as e:
    print(f"An error occurred while reading the configuration file: {e}")

# Create an instance of PubMedDownloader
downloader = PubMedDownloader(api_key, email)

# Define the topic and year range
topic = "gene expression"
start_year = 1990
end_year = 2024

# Fetch and save records by year, then consolidate and clean up
for year in range(start_year, end_year + 1):
    year_query = f"{topic} AND {year}[Date]"
    downloader.fetch_pubmed_data(year_query, year)

# Consolidate all files into one
downloader.consolidate_files(topic)
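
With the defaults above, each year's records are written to ./results/baseline_doc under a name derived from the normalized query, and consolidate_files then merges every file starting with the normalized topic into gene_expression_consolidated.txt. A small illustration of the naming logic used by fetch_pubmed_data:

# Illustration of the file naming produced by the code above.
query = "gene expression AND 1990[Date]"
normalized = query.replace(" ", "_").replace("[", "").replace("]", "")
print(normalized)                # gene_expression_AND_1990Date
print(f"{normalized}.1990.txt")  # per-year file name under ./results/baseline_doc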