diff --git a/.gitignore b/.gitignore
index 9f9b25a..8dc7703 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,4 @@ results/baseline_doc/pubmed.zinc.0.full.txt
 results/baseline_doc/pubmed.zinc.2.15.txt
 results/baseline_doc/zinc AND 2013\[Date\].2013.txt
 /results
+config.ini
diff --git a/code/step1_data_collection.py b/code/step1_data_collection.py
index e01710c..863cfc6 100644
--- a/code/step1_data_collection.py
+++ b/code/step1_data_collection.py
@@ -52,7 +52,7 @@
 ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up
 
 g2d_starting_point = 0
-batch_size = 100
+batch_size = 1000
 #############################
 #####################
 gene_end_point = round(query_size/batch_size)
diff --git a/code/step1_data_collection_Custom_Luis.py b/code/step1_data_collection_Custom_Luis.py
index 6f86893..5f61509 100644
--- a/code/step1_data_collection_Custom_Luis.py
+++ b/code/step1_data_collection_Custom_Luis.py
@@ -1,76 +1,151 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Jun 21 00:16:25 2020
-python -u "/mnt/c/Users/lrm22005/OneDrive - University of Connecticut/Research/ZIP11_Bioinformatic/capsule-3642152/code/step1_data_collection.py" 'zinc' 0 0 './results/zinc'
-"""
-
 import os
-import pathlib
-import sys
 import time
-import urllib.error
-
-sys.path.append('lib')
-from lib.Literature_Data_Collection import literature_data_collection
-
-if len(sys.argv) > 3:
-    word_query = str(sys.argv[1])
-    word_end_point = int(sys.argv[2]) # the endpoint of a word-based data collection. for demo-b 100000
-    gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection for demo-b 50
-    paths = str(sys.argv[4]) + '/'
-elif len(sys.argv) == 3:
-    word_query = str(sys.argv[1])
-    paths = str(sys.argv[2]) + '/'
-
-data_dir = os.path.abspath(os.getcwd())
-output_dir = os.path.join(data_dir, paths + 'baseline_doc')
-document_output_dir = os.path.join(data_dir, paths + 'gene2document')
-pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-pathlib.Path(document_output_dir).mkdir(parents=True, exist_ok=True)
-
-email = "lrmercadod@gmail.com" # Replace with your valid email address
-api_key = "19bea34a4dbdbc6ef30392cee15943365309"
-ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
-
-# setting up
-########### word query based literature data collection #################
-gap = 1000
-batch = 200
-w2d_starting_point = 0
+import shutil
+import logging
+import requests
+from Bio import Entrez
+from io import BytesIO
+import configparser
+
+# Ensure the current working directory is correct
+print("Current working directory:", os.getcwd())
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class PubMedDownloader:
+    def __init__(self, api_key, email, max_records_per_query=9999):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
+        self.email = email
+        self.max_records_per_query = max_records_per_query
+        Entrez.email = email # Set email for NCBI E-utilities
+
+    def fetch_pubmed_data(self, query, year):
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        records = []
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    logging.info(f"Total records found for the query '{query}': {count}")
+
+                    if count > 0:
+                        for start in range(0, min(count, self.max_records_per_query), self.max_records_per_query):
+                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                            fetch_response = requests.get(fetch_url, timeout=10)
+                            records.append(fetch_response.text)
+                            logging.info(f"Fetched records starting from {start}")
+                        return self.save_records_to_file(normalized_query, year, records)
+                    else:
+                        logging.info(f"No records found for the query '{query}'")
+                        return []
+                break
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                logging.error(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)
+        return []
+
+    def save_records_to_file(self, query, year, records):
+        directory = os.path.join(".", "results", "baseline_doc")
+        os.makedirs(directory, exist_ok=True)
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))
+        logging.info(f"Saved records to {file_path}")
+        return file_path
+
+    def consolidate_files(self, query):
+        directory = os.path.join(".", "results", "baseline_doc")
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        consolidated_file_path = os.path.join(directory, f"{normalized_query}_consolidated.txt")
+
+        # Check if there are any files to consolidate
+        if not os.listdir(directory):
+            logging.info("No files found in the directory to consolidate.")
+            return
+
+        # Opening the consolidated file outside the loop to write all contents
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            # Loop over each file in the directory
+            for fname in os.listdir(directory):
+                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
+                    filepath = os.path.join(directory, fname)
+                    # Ensure the file is not the consolidated file itself
+                    if filepath != consolidated_file_path:
+                        # Open, read, and close the file
+                        with open(filepath, 'r', encoding='utf-8') as infile:
+                            content = infile.read()
+                            outfile.write(content + "\n")
+                            logging.info(f"Added content from {fname} to the consolidated file.")
+
+                        # Remove the individual file after its content has been written
+                        try:
+                            os.remove(filepath)
+                            logging.info(f"Removed file {fname} after consolidation.")
+                        except OSError as e:
+                            logging.error(f"Error occurred while removing file {fname}: {e}")
+
+        logging.info(f"Consolidated records into {consolidated_file_path}")
+
+        # Optional: Clean up the directory if empty
+        if not os.listdir(directory):
+            shutil.rmtree(directory)
+            logging.info("Removed empty directory after consolidation.")
+
+# # Read API key and email from the configuration file
+# config = configparser.ConfigParser()
+# config.read('config.ini')
+# api_key = config.get('pubmed', 'api_key')
+# email = config.get('pubmed', 'email')
+
+# Ensure the current working directory is correct
+print("Current working directory:", os.getcwd())
+
+config = configparser.ConfigParser()
+config_path = 'config.ini' # Make sure this path is correct
+
+# Check if the config file exists to rule out path issues
+if not os.path.exists(config_path):
+    print(f"Configuration file not found at {config_path}")
+else:
print(f"Configuration file found at {config_path}") try: - search_results, _word_end_point = ld.word_based_query_fit(year=None, user_term=word_query) - print('The number of available abstracts:', _word_end_point, 'for', word_query) - - if int(sys.argv[2]) == 0: - word_end_point = _word_end_point - - ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, - ixs=w2d_starting_point, test_end_point=word_end_point) -except urllib.error.HTTPError as e: - print(f"An HTTP error occurred: {e}") - print("Retrying in 5 seconds...") - time.sleep(5) - # Retry the request or handle the error appropriately - -########### gene name-query based literature data collection ################# -query_full = ld.text_open('./data/gene_name_info/query_full_name.txt') -query_symbol = ld.text_open('./data/gene_name_info/query_symbol.txt') -# gene name list -query_size = len(query_full) -ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up - -g2d_starting_point = 0 -batch_size = 10 - -############################ -gene_end_point = round(query_size / batch_size) - -if len(sys.argv) > 2: - gene_end_point = int(sys.argv[3]) # the endpoint of gene name-based data collection - -if int(sys.argv[3]) == 0: - gene_end_point = round(query_size / batch_size) - -ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, - query_len=len(query_full), end_point=gene_end_point) \ No newline at end of file + config.read(config_path) + # Explicitly list sections and keys + print("Sections available:", config.sections()) + # Attempt to read the API key and email from the 'DEFAULT' section + api_key = config.get('DEFAULT', 'api_key') + email = config.get('DEFAULT', 'email') +except configparser.NoSectionError as e: + print(f"Missing section in your configuration file: {e}") +except configparser.NoOptionError as e: + print(f"Missing option in your configuration file: {e}") +except Exception as e: + print(f"An error occurred while reading the configuration file: {e}") + +# Create an instance of PubMedDownloader +downloader = PubMedDownloader(api_key, email) + +# Define the topic and year range +topic = "gene expression" +start_year = 1990 +end_year = 2024 + +# Fetch and save records by year, then consolidate and clean up +for year in range(start_year, end_year + 1): + year_query = f"{topic} AND {year}[Date]" + downloader.fetch_pubmed_data(year_query, year) + +# Consolidate all files into one +downloader.consolidate_files(topic) \ No newline at end of file diff --git a/code/step_1_data_collection_Luis.py b/code/step_1_data_collection_Luis.py index eb607da..ca511f1 100644 --- a/code/step_1_data_collection_Luis.py +++ b/code/step_1_data_collection_Luis.py @@ -24,20 +24,20 @@ """ import requests from Bio import Entrez -from io import StringIO -from Bio import Medline from io import BytesIO import time import os +import shutil class PubMedDownloader: def __init__(self, api_key, email): self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" self.api_key = api_key self.email = email - Entrez.email = email # Setting email for Biopython Entrez + Entrez.email = email # Set email for NCBI E-utilities def fetch_pubmed_data(self, query, year, max_records_per_query=9999): + normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "") records = [] attempt = 0 max_attempts = 5 @@ -53,95 +53,75 @@ def fetch_pubmed_data(self, query, year, max_records_per_query=9999): count = int(search_results['Count']) 
print(f"Total records found for the query '{query}': {count}") - for start in range(0, min(count, max_records_per_query), max_records_per_query): - fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" - fetch_response = requests.get(fetch_url, timeout=10) - records.append(fetch_response.text) - print(f"Fetched records starting from {start}") - break - else: - print(f"Failed to initiate search with status {search_response.status_code}") + if count > 0: + for start in range(0, min(count, max_records_per_query), max_records_per_query): + fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}" + fetch_response = requests.get(fetch_url, timeout=10) + records.append(fetch_response.text) + print(f"Fetched records starting from {start}") + return self.save_records_to_file(normalized_query, year, records) + break except requests.exceptions.RequestException as e: attempt += 1 print(f"Attempt {attempt}: An error occurred: {e}") - time.sleep(2 ** attempt) # Exponential backoff - # Save records to a file - self.save_records_to_file(query, year, records) - return records + time.sleep(2 ** attempt) + return None def save_records_to_file(self, query, year, records): - directory = f"./results/baseline_doc" - os.makedirs(directory, exist_ok=True) # Create directory if it doesn't exist + directory = "./results/baseline_doc/" + os.makedirs(directory, exist_ok=True) filename = f"{query}.{year}.txt" file_path = os.path.join(directory, filename) with open(file_path, 'w', encoding='utf-8') as file: - file.write("\n".join(records)) # Each record is separated by a newline + file.write("\n".join(records)) print(f"Saved records to {file_path}") + return file_path -class ids_pubmed(): - def __init__(self): - self.snp_ids = [] - self.uids = [] - self.gene_names = [] - self.names = [] - self.records = [] - self.gene_full_names = [] - self.saved_snp_id = [] - - def search_ids(self, search_email): - removal_index = [] - Entrez.email = search_email - records = [] - for snp_id in self.snp_ids: - record = Entrez.read(Entrez.elink(dbfrom="snp", - id=snp_id.replace('rs', ''), - db="gene")) - if record[0]['LinkSetDb'] == []: - removal_index.append(snp_id) - print("index is removed: ", snp_id) - - else: - results = record[0]['LinkSetDb'][0]['Link'] - multi_gene = [] - multi_full_name = [] - multi_uid = [] - for result in results: - uid = result['Id'] - handle = Entrez.esummary(db="gene", id=uid) - uid_record = Entrez.read(handle) - - records.append(uid_record) - handle.close() - uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0] - gene_name = uid_summary['Name'] - gene_full_name = uid_summary['Description'] - if len(results) > 1: - multi_gene.append(gene_name) - multi_full_name.append(gene_full_name) - multi_uid.append(uid) - else: - multi_gene = gene_name - multi_full_name = gene_full_name - multi_uid = uid - - if len(results) > 1: - multi_uid = "#".join(multi_uid) - multi_gene = "#".join(multi_gene) - multi_full_name = "#".join(multi_full_name) - - self.uids.append(multi_uid) - self.gene_names.append(multi_gene) - self.gene_full_names.append(multi_full_name) - self.saved_snp_id.append(snp_id) - return removal_index, records, self.uids, self.gene_names, self.gene_full_names - -# Example usage: -api_key = "19bea34a4dbdbc6ef30392cee15943365309" # 
-email = "lrmercadod@gmail.com" # Replace with your email
+    def consolidate_files(self, query):
+        directory = "./results/baseline_doc/"
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        consolidated_file_path = f"./results/baseline_doc/{normalized_query}_consolidated.txt"
+
+        # Check if there are any files to consolidate
+        if not os.listdir(directory):
+            print("No files found in the directory to consolidate.")
+            return
+
+        # Opening the consolidated file outside the loop to write all contents
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            # Loop over each file in the directory
+            for fname in os.listdir(directory):
+                if fname.startswith(normalized_query) and fname.endswith(".txt") and 'consolidated' not in fname:
+                    filepath = os.path.join(directory, fname)
+                    # Ensure the file is not the consolidated file itself
+                    if filepath != consolidated_file_path:
+                        # Open, read, and close the file
+                        with open(filepath, 'r', encoding='utf-8') as infile:
+                            content = infile.read()
+                            outfile.write(content + "\n")
+                            print(f"Added content from {fname} to the consolidated file.")
+
+                        # Remove the individual file after its content has been written
+                        os.remove(filepath)
+                        print(f"Removed file {fname} after consolidation.")
+        print(f"Consolidated records into {consolidated_file_path}")
+
+        # Optional: Clean up the directory if empty
+        if not os.listdir(directory):
+            shutil.rmtree(directory)
+            print("Removed empty directory after consolidation.")
+
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+email = "lrmercadod@gmail.com"
 downloader = PubMedDownloader(api_key, email)
-topic = "zinc" # Define the topic of interest
-# Fetch and save records by year
-for year in range(1990, 2023): # Example range of years
+topic = "zinc"
+
+# Fetch and save records by year, then consolidate and clean up
+for year in range(1990, 2025):
     year_query = f"{topic} AND {year}[Date]"
-    downloader.fetch_pubmed_data(year_query, year)
\ No newline at end of file
+    if not downloader.fetch_pubmed_data(year_query, year):
+        print(f"No data found or failed to fetch for {year_query}")
+
+# Consolidate all files into one
+downloader.consolidate_files(topic)
\ No newline at end of file
diff --git a/code/step_1_data_collection_Luis_.py b/code/step_1_data_collection_Luis_.py
index 3c2fa52..9768f7d 100644
--- a/code/step_1_data_collection_Luis_.py
+++ b/code/step_1_data_collection_Luis_.py
@@ -1,43 +1,81 @@
 import requests
+from Bio import Entrez
+from io import BytesIO
 import time
+import os
+import shutil
 
-def fetch_pubmed_data(query, max_results=1000000):
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
-    api_key = "19bea34a4dbdbc6ef30392cee15943365309" # Replace with your actual NCBI API key
-    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
-    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
-
-    # Perform the initial search
-    search_response = requests.get(search_url)
-    if search_response.status_code != 200:
-        print("Failed to retrieve data")
-        return
-
-    search_results = search_response.text
-    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
-    id_list = id_list.strip().split()
-
-    print(f"Found {len(id_list)} records, fetching data...")
-
-    # Fetch details of all IDs
-    records = []
-    for start in range(0, len(id_list), 500): # PubMed allows fetching up to 500 records at a time
-        end = min(start + 500, len(id_list))
-        ids = ','.join(id_list[start:end])
-        fetch_response = requests.get(f"{fetch_url}&id={ids}")
-        if fetch_response.status_code == 200:
-            records.append(fetch_response.text)
-        else:
-            print("Failed to fetch data for some records.")
-        time.sleep(0.5) # to prevent hitting rate limit
-
-    return records
-
-# Example usage
-topic = "zinc"
-downloaded_data = fetch_pubmed_data(topic)
-
-# Optionally, save the data to a file
-with open("pubmed_data.txt", "w") as file:
-    for record in downloaded_data:
-        file.write(record)
+class PubMedDownloader:
+    def __init__(self, api_key, email):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
+        self.email = email
+        Entrez.email = email # Set email for NCBI E-utilities
+
+    def fetch_pubmed_data(self, query, year, max_records_per_query=9999):
+        normalized_query = query.replace(" ", "_").replace("[", "").replace("]", "")
+        records = []
+        attempt = 0
+        max_attempts = 5
+
+        while attempt < max_attempts:
+            try:
+                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query} AND open access[filter]&retmax=1&api_key={self.api_key}&usehistory=y"
+                search_response = requests.get(search_url, timeout=10)
+                if search_response.status_code == 200:
+                    search_results = Entrez.read(BytesIO(search_response.content))
+                    webenv = search_results['WebEnv']
+                    query_key = search_results['QueryKey']
+                    count = int(search_results['Count'])
+                    print(f"Total records found for the query '{query}': {count}")
+
+                    if count > 0:
+                        for start in range(0, min(count, max_records_per_query), max_records_per_query):
+                            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+                            fetch_response = requests.get(fetch_url, timeout=10)
+                            records.append(fetch_response.text)
+                            print(f"Fetched records starting from {start}")
+                        return self.save_records_to_file(normalized_query, year, records)
+                break
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt}: An error occurred: {e}")
+                time.sleep(2 ** attempt)
+        return None
+
+    def save_records_to_file(self, query, year, records):
+        directory = "./results/baseline_doc/"
+        os.makedirs(directory, exist_ok=True)
+        filename = f"{query}.{year}.txt"
+        file_path = os.path.join(directory, filename)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write("\n".join(records))
+        print(f"Saved records to {file_path}")
+        return file_path
+
+    def consolidate_files(self, query):
+        directory = "./results/baseline_doc/"
+        consolidated_file_path = f"./results/baseline_doc/{query}_consolidated.txt"
+        with open(consolidated_file_path, 'w', encoding='utf-8') as outfile:
+            for fname in os.listdir(directory):
+                if fname.startswith(query) and fname.endswith(".txt"):
+                    filepath = os.path.join(directory, fname)
+                    with open(filepath, 'r', encoding='utf-8') as infile:
+                        outfile.write(infile.read() + "\n")
+                    os.remove(filepath) # Remove the file after consolidating
+        print(f"Consolidated records into {consolidated_file_path}")
+
+# Usage example:
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"
+email = "lrmercadod@gmail.com"
+downloader = PubMedDownloader(api_key, email)
+topic = "zinc"
+
+# Fetch and save records by year, then consolidate and clean up
+for year in range(1990, 2025):
+    year_query = f"{topic} AND {year}[Date]"
+    if not downloader.fetch_pubmed_data(year_query, year):
+        print(f"No data found or failed to fetch for {year_query}")
+
+# Consolidate all files into one
+downloader.consolidate_files(topic)
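
Note: with config.ini now git-ignored, the rewritten step1_data_collection_Custom_Luis.py expects a local config.ini supplying the api_key and email values it reads via config.get('DEFAULT', ...). A minimal sketch of a helper that generates such a file, assuming only the DEFAULT-section layout implied by the code above (the file name generate_config.py and both values are placeholders, not real credentials):

    # generate_config.py (hypothetical helper; not part of this diff)
    import configparser

    config = configparser.ConfigParser()
    config['DEFAULT'] = {
        'api_key': 'YOUR_NCBI_API_KEY',   # placeholder, use your own NCBI key
        'email': 'you@example.com',       # placeholder, use your own address
    }
    with open('config.ini', 'w') as f:
        config.write(f)  # writes a [DEFAULT] section readable by config.get('DEFAULT', ...)

Running it once in the repository root produces a config.ini that the collection script can read without committing credentials to version control.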