
Commit

Initial security update
Changes to files and optimization of the downloaders.
lrm22005 committed May 7, 2024
1 parent 0e4993d commit 860d0fe
Showing 5 changed files with 289 additions and 195 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -37,3 +37,4 @@ results/baseline_doc/pubmed.zinc.0.full.txt
results/baseline_doc/pubmed.zinc.2.15.txt
results/baseline_doc/zinc AND 2013\[Date\].2013.txt
/results
config.ini
2 changes: 1 addition & 1 deletion code/step1_data_collection.py
@@ -52,7 +52,7 @@
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 100
batch_size = 1000
#############################
#####################
gene_end_point = round(query_size/batch_size)
219 changes: 147 additions & 72 deletions code/step1_data_collection_Custom_Luis.py
@@ -1,76 +1,151 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
"""

import os
import pathlib
import sys
import time
import urllib.error

sys.path.append('lib')
from lib.Literature_Data_Collection import literature_data_collection

if len(sys.argv) > 3:
word_query = str(sys.argv[1])
word_end_point = int(sys.argv[2]) # the endpoint of a word-based data collection. for demo-b 100000
gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection for demo-b 50
paths = str(sys.argv[4]) + '/'
elif len(sys.argv) == 3:
word_query = str(sys.argv[1])
paths = str(sys.argv[2]) + '/'

data_dir = os.path.abspath(os.getcwd())
output_dir = os.path.join(data_dir, paths + 'baseline_doc')
document_output_dir = os.path.join(data_dir, paths + 'gene2document')
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)

email = "lrmercadod@gmail.com" # Replace with your valid email address
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)

# setting up
########### word query based literature data collection #################
gap = 1000
batch = 200
w2d_starting_point = 0
import shutil
import logging
import requests
from Bio import Entrez
from io import BytesIO
import configparser

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class PubMedDownloader:
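# Wraps the NCBI E-utilities endpoints (esearch + efetch with usehistory) to
# pull MEDLINE-format PubMed records in batches.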
def __init__(self, api_key, email, max_records_per_query=9999):
self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
self.api_key = api_key
self.email = email
self.max_records_per_query = max_records_per_query
Entrez.email = email # Set email for NCBI E-utilities

def fetch_pubmed_data(self, query, year):
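# Run esearch once (retmax=1, usehistory=y) to get the total count plus
# WebEnv/QueryKey, then efetch MEDLINE text from the history server,
# retrying with exponential backoff on request errors.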
normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
records = []
attempt = 0
max_attempts = 5

while attempt < max_attempts:
try:
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url, timeout=10)
if search_response.status_code == 200:
search_results = Entrez.read(BytesIO(search_response.content))
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
count = int(search_results['Count'])
logging.info(f"Total records found for the query '{query}': {count}")

if count > 0:
for start in range(0, min(count, self.max_records_per_query), self.max_records_per_query):  # capped at max_records_per_query records, i.e. a single efetch page
fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
fetch_response = requests.get(fetch_url, timeout=10)
records.append(fetch_response.text)
logging.info(f"Fetched records starting from {start}")
return self.save_records_to_file(normalized_query, year, records)
else:
logging.info(f"No records found for the query '{query}'")
return []
break  # non-200 response: stop retrying
except requests.exceptions.RequestException as e:
attempt += 1
logging.error(f"Attempt {attempt}: An error occurred: {e}")
time.sleep(2 ** attempt)
return []

def save_records_to_file(self, query, year, records):
directory = os.path.join(".", "results", "baseline_doc")
os.makedirs(directory, exist_ok=True)
filename = f"{query}.{year}.txt"
file_path = os.path.join(directory, filename)
with open(file_path, 'w', encoding='utf-8') as file:
file.write("\n".join(records))
logging.info(f"Saved records to {file_path}")
return file_path

def consolidate_files(self, query):
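# Merge every per-year file for this query into a single
# <query>_consolidated.txt, deleting the per-year files as they are merged.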
directory = os.path.join(".", "results", "baseline_doc")
normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")

# Check if there are any files to consolidate
if not os.listdir(directory):
logging.info("No files found in the directory to consolidate.")
return

# Opening the consolidated file outside the loop to write all contents
with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
# Loop over each file in the directory
for fname in os.listdir(directory):
if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
filepath = os.path.join(directory, fname)
# Ensure the file is not the consolidated file itself
if filepath != consolidated_file_path:
# Open, read, and close the file
with open(filepath, 'r', encoding='utf-8') as infile:
content = infile.read()
outfile.write(content + "\n")
logging.info(f"Added content from {fname} to the consolidated file.")

# Remove the individual file after its content has been written
try:
os.remove(filepath)
logging.info(f"Removed file {fname} after consolidation.")
except OSError as e:
logging.error(f"Error occurred while removing file {fname}: {e}")

logging.info(f"Consolidated records into {consolidated_file_path}")

# Optional: Clean up the directory if empty
if not os.listdir(directory):
shutil.rmtree(directory)
logging.info("Removed empty directory after consolidation.")

# # Read API key and email from the configuration file
# config = configparser.ConfigParser()
# config.read('config.ini')
# api_key = config.get('pubmed', 'api_key')
# email = config.get('pubmed', 'email')

# Ensure the current working directory is correct
print("Current working directory:", os.getcwd())

config = configparser.ConfigParser()
config_path = 'config.ini' # Make sure this path is correct

# Check if the config file exists to rule out path issues
if not os.path.exists(config_path):
print(f"Configuration file not found at {config_path}")
else:
print(f"Configuration file found at {config_path}")

try:
search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
print('The number of available abstracts:', _word_end_point, 'for', word_query)

if int(sys.argv[2]) == 0:
word_end_point = _word_end_point

ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point,
ixs=w2d_starting_point, test_end_point=word_end_point)
except urllib.error.HTTPError as e:
print(f"An HTTP error occurred: {e}")
print("Retrying in 5 seconds...")
time.sleep(5)
# Retry the request or handle the error appropriately

########### gene name-query based literature data collection #################
query_full = ld.text_open('./data/gene_name_info/query_full_name.txt')
query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt')
# gene name list
query_size = len(query_full)
ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

g2d_starting_point = 0
batch_size = 10

############################
gene_end_point = round(query_size / batch_size)

if len(sys.argv) > 2:
gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection

if int(sys.argv[3]) == 0:
gene_end_point = round(query_size / batch_size)

ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point,
query_len=len(query_full), end_point=gene_end_point)
config.read(config_path)
# Explicitly list sections and keys
print("Sections available:", config.sections())
# Attempt to read the API key and email from the 'DEFAULT' section
api_key = config.get('DEFAULT', 'api_key')
email = config.get('DEFAULT', 'email')
except configparser.NoSectionError as e:
print(f"Missing section in your configuration file: {e}")
except configparser.NoOptionError as e:
print(f"Missing option in your configuration file: {e}")
except Exception as e:
print(f"An error occurred while reading the configuration file: {e}")

# Create an instance of PubMedDownloader
downloader = PubMedDownloader(api_key, email)
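# max_records_per_query defaults to 9999 (see __init__), so each call to
# fetch_pubmed_data returns at most 9999 MEDLINE records per query.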

# Define the topic and year range
topic = "gene expression"
start_year = 1990
end_year = 2024

# Fetch and save records by year, then consolidate and clean up
for year in range(start_year, end_year + 1):
year_query = f"{topic} AND {year}[Date]"
downloader.fetch_pubmed_data(year_query, year)

# Consolidate all files into one
downloader.consolidate_files(topic)
