Luis updating codes #5

Merged: 2 commits, Jul 25, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -38,3 +38,6 @@ results/baseline_doc/pubmed.zinc.2.15.txt
 results/baseline_doc/zinc AND 2013\[Date\].2013.txt
 /results
 config.ini
+/gene_based_records
+checkpoint.json
+checkpoint.json
1 change: 1 addition & 0 deletions checkpoint.json

Large diffs are not rendered by default.
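Although the diff is not rendered, the checkpoint file's shape follows from the downloader script added below (step1_data_collection_Luis_genes.py): it is a flat JSON object mapping each processed gene name to the path of its saved MEDLINE file, which is how re-runs skip genes that are already done. A minimal sketch of its contents (gene names and paths are illustrative):

{
  "BRCA1": "./gene_based_records/BRCA1.txt",
  "tumor protein p53": "./gene_based_records/tumor_protein_p53.txt"
}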

3 changes: 2 additions & 1 deletion code/lib/Loading_PudMed.py
@@ -12,6 +12,7 @@
 from io import StringIO
 import time
 sys.path.append('lib')
+from http.client import IncompleteRead

 from Bio import Medline
 import os
@@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch):
             data = fetch_handle.read()
             fetch_handle.close()
             out_handle.write(data)
-            time.sleep(2) # Delay between each batch fetch to respect the API rate limit
+            time.sleep(5) # Delay between each batch fetch to respect the API rate limit
         out_handle.close()

     def fetch_rec(self, rec_id, entrez_handle):
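The new IncompleteRead import suggests the collapsed portion of this file now retries when NCBI closes a connection mid-transfer. The diff does not show that code, so the following is only a hypothetical sketch of the usual pattern; read_with_retry, open_handle, and max_attempts are invented for illustration:

from http.client import IncompleteRead
import time

def read_with_retry(open_handle, max_attempts=3):
    # open_handle is a zero-argument callable that re-opens the Entrez fetch,
    # e.g. lambda: Entrez.efetch(db="pubmed", id=rec_id, rettype="medline", retmode="text")
    for attempt in range(1, max_attempts + 1):
        handle = open_handle()
        try:
            return handle.read()
        except IncompleteRead as e:
            if attempt == max_attempts:
                return e.partial  # settle for the bytes that did arrive (may need decoding)
            time.sleep(2 * attempt)  # back off before re-opening the connection
        finally:
            handle.close()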
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions code/step1_data_collection.py
@@ -12,7 +12,7 @@
 sys.path.append('lib')
 from lib.Literature_Data_Collection import literature_data_collection

-years = 15
+years = 35

 if len(sys.argv)>3:
     word_query = str(sys.argv[1])
@@ -52,7 +52,7 @@
 ld.gene_based_query_fit(query_size, query_full, query_symbol) # setting up

 g2d_starting_point = 0
-batch_size = 1000
+batch_size = 100
 #############################
 #####################
 gene_end_point = round(query_size/batch_size)
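For context on the batch_size change: gene_end_point is derived from it, so shrinking batches from 1000 to 100 records means roughly ten times as many fetch requests, each returning a smaller payload. Together with the sleep bump from 2 s to 5 s in Loading_PudMed.py above, this trades throughput for gentler use of the E-utilities endpoint. Illustrative arithmetic (the query_size value is made up):

query_size = 3000                       # illustrative; the real value comes from the query
old_batches = round(query_size / 1000)  # 3 fetches of up to 1000 records each
new_batches = round(query_size / 100)   # 30 fetches of up to 100 records each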
101 changes: 101 additions & 0 deletions code/step1_data_collection_Luis_genes.py
@@ -0,0 +1,101 @@
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json

class GenePubMedDownloader:
    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        Entrez.email = email  # Set email for NCBI E-utilities
        self.max_records_per_query = max_records_per_query
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]
        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        attempt = 0
        max_attempts = 5

        while attempt < max_attempts:
            records = []  # reset on each attempt so retries do not duplicate batches
            try:
                search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code == 200:
                    search_results = Entrez.read(BytesIO(search_response.content))
                    if 'WebEnv' in search_results and 'QueryKey' in search_results:
                        webenv = search_results['WebEnv']
                        query_key = search_results['QueryKey']
                        count = int(search_results['Count'])
                        logging.info(f"Total records found for {gene_name}: {count}")
                        if count > 0:
                            # Page through the history-server results in chunks of max_records_per_query
                            for start in range(0, count, self.max_records_per_query):
                                fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text&retstart={start}&retmax={self.max_records_per_query}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                                fetch_response = requests.get(fetch_url, timeout=10)
                                records.append(fetch_response.text)
                                logging.info(f"Fetched records for {gene_name} starting from {start}")
                        else:
                            logging.info(f"No records found for {gene_name}.")
                            return None
                    else:
                        logging.error(f"No WebEnv/QueryKey found in the search results for {gene_name}.")
                        return None
                    file_path = self.save_records_to_file(normalized_gene, records)
                    self.checkpoint_data[gene_name] = file_path
                    self.save_checkpoint()
                    return file_path
                # Non-200 response: count it as a failed attempt and back off
                attempt += 1
                logging.error(f"Attempt {attempt}: HTTP {search_response.status_code} for {gene_name}")
                time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                attempt += 1
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # Exponential backoff between retries
        return None

    def save_records_to_file(self, gene_name, records):
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)

def load_gene_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Example usage
api_key = "19bea34a4dbdbc6ef30392cee15943365309"
email = "lrmercadod@gmail.com"
output_dir = "./gene_based_records/"
downloader = GenePubMedDownloader(api_key, email, output_dir)

# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
    downloader.fetch_pubmed_data(gene)
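Since the records are fetched with rettype=medline and retmode=text, a quick way to confirm a downloaded file is usable is to run it through Biopython's Medline parser. A hypothetical sanity check, not part of the PR (the filename is illustrative):

from Bio import Medline

# Parse one saved file and print a PMID and title per record
with open("./gene_based_records/BRCA1.txt", encoding="utf-8") as handle:
    for record in Medline.parse(handle):
        print(record.get("PMID"), record.get("TI", "<no title>"))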