Changes to files and optimization of the downloaders.
Showing 5 changed files with 289 additions and 195 deletions.
step1_data_collection.py
@@ -1,76 +1,151 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
"""

import configparser
import logging
import os
import pathlib
import shutil
import sys
import time
import urllib.error
from io import BytesIO

import requests
from Bio import Entrez

sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

if len(sys.argv) > 3:
    word_query = str(sys.argv[1])
    word_end_point = int(sys.argv[2])  # the endpoint of the word-based data collection; for demo-b, 100000
    gene_end_point = int(sys.argv[3])  # the endpoint of the gene-name-based data collection; for demo-b, 50
    paths = str(sys.argv[4]) + '/'
elif len(sys.argv) == 3:
    word_query = str(sys.argv[1])
    paths = str(sys.argv[2]) + '/'
    # Assumption: default both endpoints to 0 ("use everything") when they
    # are not supplied, so the endpoint checks below never hit an unset name.
    word_end_point = 0
    gene_end_point = 0
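
# Example invocations, mirroring the docstring above (the two-argument form
# follows from the elif branch):
#   python step1_data_collection.py 'zinc' 0 0 './results/zinc'
#   python step1_data_collection.py 'zinc' './results/zinc'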

data_dir = os.path.abspath(os.getcwd())
output_dir = os.path.join(data_dir, paths + 'baseline_doc')
document_output_dir = os.path.join(data_dir, paths + 'gene2document')
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)

email = "lrmercadod@gmail.com" # Replace with your valid email address | ||
api_key = "19bea34a4dbdbc6ef30392cee15943365309" | ||
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key) | ||
|
||
# setting up
########### word query based literature data collection #################
gap = 1000
batch = 200
w2d_starting_point = 0

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class PubMedDownloader:
    def __init__(self, api_key, email, max_records_per_query=9999):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities

    def fetch_pubmed_data(self, query, year):
        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
        records = []
        attempt = 0
        max_attempts = 5

        while attempt < max_attempts:
            try:
                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    webenv = search_results['WebEnv']
                    query_key = search_results['QueryKey']
                    count = int(search_results['Count'])
                    logging.info(f"Total records found for the query '{query}': {count}")

                    if count > 0:
                        # Page through the full result set in chunks of max_records_per_query
                        for start in range(0, count, self.max_records_per_query):
                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                            fetch_response = requests.get(fetch_url, timeout=10)
                            records.append(fetch_response.text)
                            logging.info(f"Fetched records starting from {start}")
                        return self.save_records_to_file(normalized_query, year, records)
                    else:
                        logging.info(f"No records found for the query '{query}'")
                        return []
                else:
                    # Treat a non-200 esearch response as a failed attempt
                    attempt += 1
                    logging.error(f"Attempt {attempt}: esearch returned HTTP {search_response.status_code}")
                    time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # exponential backoff between retries
        return []
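
    # esearch with usehistory=y stores the matching PMIDs on NCBI's history
    # server and returns WebEnv/QueryKey handles, so efetch can page through
    # the results with retstart/retmax instead of sending long ID lists.
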
    def save_records_to_file(self, query, year, records):
        directory = os.path.join(".", "results", "baseline_doc")
        os.makedirs(directory, exist_ok=True)
        filename = f"{query}.{year}.txt"
        file_path = os.path.join(directory, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records to {file_path}")
        return file_path
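
    # Files are named "<normalized_query>.<year>.txt"; consolidate_files
    # below finds them again by that prefix before merging.
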
    def consolidate_files(self, query):
        directory = os.path.join(".", "results", "baseline_doc")
        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
        consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")

        # Check if there are any files to consolidate
        if not os.listdir(directory):
            logging.info("No files found in the directory to consolidate.")
            return

        # Opening the consolidated file outside the loop to write all contents
        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
            # Loop over each file in the directory
            for fname in os.listdir(directory):
                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
                    filepath = os.path.join(directory, fname)
                    # Ensure the file is not the consolidated file itself
                    if filepath != consolidated_file_path:
                        # Open, read, and close the file
                        with open(filepath, 'r', encoding='utf-8') as infile:
                            content = infile.read()
                            outfile.write(content + "\n")
                            logging.info(f"Added content from {fname} to the consolidated file.")

                        # Remove the individual file after its content has been written
                        try:
                            os.remove(filepath)
                            logging.info(f"Removed file {fname} after consolidation.")
                        except OSError as e:
                            logging.error(f"Error occurred while removing file {fname}: {e}")

        logging.info(f"Consolidated records into {consolidated_file_path}")

        # Optional: Clean up the directory if empty
        if not os.listdir(directory):
            shutil.rmtree(directory)
            logging.info("Removed empty directory after consolidation.")

# # Read API key and email from the configuration file
# config = configparser.ConfigParser()
# config.read('config.ini')
# api_key = config.get('pubmed', 'api_key')
# email = config.get('pubmed', 'email')

config = configparser.ConfigParser()
config_path = 'config.ini'  # Make sure this path is correct

# Check if the config file exists to rule out path issues
if not os.path.exists(config_path):
    print(f"Configuration file not found at {config_path}")
else:
    print(f"Configuration file found at {config_path}")

try:
    config.read(config_path)
    # Explicitly list sections and keys
    print("Sections available:", config.sections())
    # Attempt to read the API key and email from the 'DEFAULT' section
    api_key = config.get('DEFAULT', 'api_key')
    email = config.get('DEFAULT', 'email')
except configparser.NoSectionError as e:
    print(f"Missing section in your configuration file: {e}")
except configparser.NoOptionError as e:
    print(f"Missing option in your configuration file: {e}")
except Exception as e:
    print(f"An error occurred while reading the configuration file: {e}")

try:
    search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
    print('The number of available abstracts:', _word_end_point, 'for', word_query)

    # A 0 endpoint on the command line means "collect everything available".
    if word_end_point == 0:
        word_end_point = _word_end_point

    ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point,
                                             ixs=w2d_starting_point, test_end_point=word_end_point)
except urllib.error.HTTPError as e:
    print(f"An HTTP error occurred: {e}")
    print("Retrying in 5 seconds...")
    time.sleep(5)
    # Retry the request or handle the error appropriately

########### gene name-query based literature data collection #################
query_full = ld.text_open('./data/gene_name_info/query_full_name.txt')
query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt')
# gene name list
query_size = len(query_full)
ld.gene_based_query_fit(query_size, query_full, query_symbol)  # setting up

g2d_starting_point = 0
batch_size = 10

############################
# A 0 endpoint (or a missing one) means "scan every batch of gene names".
if gene_end_point == 0:
    gene_end_point = round(query_size / batch_size)

ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point,
                                         query_len=len(query_full), end_point=gene_end_point)

# Create an instance of PubMedDownloader
downloader = PubMedDownloader(api_key, email)
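
# NCBI E-utilities allow roughly 3 requests per second without an API key and
# about 10 per second with one, which is why the downloader sends api_key on
# every esearch/efetch call and backs off between retries.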

# Define the topic and year range
topic = "gene expression"
start_year = 1990
end_year = 2024

# Fetch and save records by year, then consolidate and clean up
for year in range(start_year, end_year + 1):
    year_query = f"{topic} AND {year}[Date]"
    downloader.fetch_pubmed_data(year_query, year)

# Consolidate all files into one
downloader.consolidate_files(topic)
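
# Note: the query template uses "{year}[Date]"; PubMed's standard tag for
# publication date is [dp] (e.g. "gene expression AND 1990[dp]"), so adjust
# the template if the year filter does not behave as expected.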