Merge pull request #1 from lrm22005/Luis
Codes update
lrm22005 authored May 6, 2024
2 parents ae973fe + abfaa4d commit e107ed9
Showing 7 changed files with 149 additions and 142 deletions.
Binary file modified code/lib/__pycache__/Loading_PudMed.cpython-311.pyc
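Note: Loading_PudMed.cpython-311.pyc is interpreter-generated bytecode; tracking __pycache__/ contents in git is almost never intentional, and adding a __pycache__/ entry to the repository's .gitignore would keep such files out of future commits.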
6 changes: 3 additions & 3 deletions code/step1_data_collection.py
@@ -33,9 +33,9 @@
 ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
 
 ########### word query based literature data collection #################
-gap=10000
-batch = 1000
-w2d_starting_point = 0
+gap=9000
+batch = 400
+w2d_starting_point = 2
 
 search_results, _word_end_point = ld.word_based_query_fit(year = years, user_term=word_query)
 print('The number of avaliable abstracts :', _word_end_point, 'for ', word_query)
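To make the effect of the retuned parameters concrete, here is a small illustrative sketch (not part of the commit) of how gap, batch, and w2d_starting_point partition a query's hit range into resumable windows; end_point is an assumed total, since the real count comes from word_based_query_fit:

    # Illustrative only: end_point is an assumed hit count, not a real query result.
    gap = 9000                # abstracts per outer collection window (was 10000)
    batch = 400               # abstracts per inner fetch request (was 1000)
    w2d_starting_point = 2    # resume from the third window rather than the first
    end_point = 30000         # assumed total number of matching abstracts

    for ix in range(w2d_starting_point, round(end_point / gap) + 1):
        start = ix * gap
        stop = min(start + gap, end_point)
        print(f"window {ix}: abstracts {start}-{stop}, fetched {batch} at a time")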
223 changes: 104 additions & 119 deletions code/step_1_data_collection_Luis.py
@@ -1,122 +1,107 @@
 # -*- coding: utf-8 -*-
 """
 Created on Sun Jun 21 00:16:25 2020
 Updated to include robust retry mechanism and API rate limiting
 """
 
-import os
-import pathlib
-import sys
-import time
-import urllib.error
-
-# Ensuring the correct append path for 'lib'
-sys.path.append(os.path.join(os.path.abspath(os.getcwd()), 'lib'))
-from lib.Loading_PudMed import ids_pudmed as pudmed
-
-class literature_data_collection:
-    def __init__(self, email, output_dir, document_output_dir, api_key=None):
-        self.output_dir = output_dir
-        self.document_output_dir = document_output_dir
-        self.email = email
+import requests
+from Bio import Entrez
+from io import StringIO
+from Bio import Medline
+from io import BytesIO
+
+class PubMedDownloader:
+    def __init__(self, api_key, email):
+        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.api_key = api_key
-        print("Initialized literature_data_collection with email: {}".format(email))
-
-    def text_open(self, path):
-        with open(path, 'r') as f:
-            data = f.read().strip().split('\n')
-        return data
-
-    def word_based_query_fit(self, year=None, user_term="heart"):
-        pud = pudmed()
-        print("Created pudmed instance for searching.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        return search_results, end_point
-
-    def collecting_doc_using_word_based_query(self, year=None, user_term="heart", gap=50000, starting=0, ixs=0, test_end_point=0):
-        pud = pudmed()
-        print("Collecting documents using word-based query.")
-        search_results, end_point = pud.search_list(user_term, year, self.email)
-        if test_end_point != 0:
-            end_point = test_end_point
-        print('Checking data collection performance --- collecting until', end_point, 'documents')
-        next_start = starting
-        for ix in range(ixs, round(end_point/gap) + 1):
-            next_start = self.robust_request(ix, gap, next_start, end_point, 10000, pud, search_results)
-            if next_start >= end_point:
-                break
+        self.email = email
+        Entrez.email = email  # Setting email for Biopython Entrez
 
-    def robust_request(self, ix, gap, starting, end_point, batch, pud, search_results):
-        success = False
-        attempts = 0
-        while not success and attempts < 5:
+    def fetch_pubmed_data(self, query, batch_size=10000):
+        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={query}&retmax=1&api_key={self.api_key}&usehistory=y"
+        search_response = requests.get(search_url)
+        if search_response.status_code == 200:
             try:
-                print(f"{ix} / {end_point // gap} | from {starting} to {min(starting + gap, end_point)}")
-                pud.search_full(ix, self.output_dir, search_results, starting, min(starting + gap, end_point), batch)
-                success = True
-            except urllib.error.HTTPError as e:
-                attempts += 1
-                wait_time = 2 ** attempts
-                print(f"An HTTP error occurred: {e}")
-                print(f"Retrying in {wait_time} seconds...")
-                time.sleep(wait_time)
-
-        if not success:
-            print("Failed after 5 attempts, skipping this batch.")
-        return starting + gap  # Returns the next starting point
-
-if __name__ == "__main__":
-    if len(sys.argv) > 3:
-        word_query = str(sys.argv[1])
-        word_end_point = int(sys.argv[2])
-        gene_end_point = int(sys.argv[3])
-        paths = str(sys.argv[4]) + '/'
-    elif len(sys.argv) == 3:
-        word_query = str(sys.argv[1])
-        paths = str(sys.argv[2]) + '/'
-
-    data_dir = os.path.abspath(os.getcwd())
-    output_dir = os.path.join(data_dir, paths + 'baseline_doc')
-    document_output_dir = os.path.join(data_dir, paths + 'gene2document')
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(document_output_dir, exist_ok=True)
-
-    email = "lrmercadod@gmail.com"  # Replace with your valid email address
-    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
-    ld = literature_data_collection(email, output_dir, document_output_dir, api_key=api_key)
-
-    gap = 50000  # Adjust as needed
-    batch = 10000  # Adjust as needed
-    w2d_starting_point = 0  # Adjust if resuming from a different point
-
-    try:
-        search_results, word_end_point = ld.word_based_query_fit(year=None, user_term=word_query)
-        print('The number of available abstracts:', word_end_point, 'for', word_query)
-
-        if int(sys.argv[2]) == 0:
-            word_end_point = word_end_point
-
-        ld.collecting_doc_using_word_based_query(year=None, user_term=word_query, gap=gap, starting=gap*w2d_starting_point, ixs=w2d_starting_point, test_end_point=word_end_point)
-    except urllib.error.HTTPError as e:
-        print(f"An HTTP error occurred: {e}")
-        print("Retrying in 5 seconds...")
-        time.sleep(5)
-
-    # Assuming gene data is prepared and ready to be processed
-    try:
-        query_full = ld.text_open('data/gene_name_info/query_full_name.txt')  # Adjust path as necessary
-        query_symbol = ld.text_open('data/gene_name_info/query_symbol.txt')  # Adjust path as necessary
-        query_size = len(query_full)
-        ld.gene_based_query_fit(query_size, query_full, query_symbol)
-
-        g2d_starting_point = 0
-        batch_size = 10
-        gene_end_point = round(query_size / batch_size)
-        if len(sys.argv) > 2:
-            gene_end_point = int(sys.argv[3])
-            if int(sys.argv[3]) == 0:
-                gene_end_point = round(query_size / batch_size)
-
-        ld.collecting_doc_using_gene_based_query(year=None, batch_size=batch_size, starting=g2d_starting_point, query_len=query_size, end_point=gene_end_point)
-    except Exception as e:
-        print(f"Error during gene-based data collection: {e}")
+                # Use BytesIO for binary data
+                search_results = Entrez.read(BytesIO(search_response.content))
+                webenv = search_results['WebEnv']
+                query_key = search_results['QueryKey']
+                count = int(search_results['Count'])
+                print(f"Total records found: {count}")
+            except Exception as e:
+                print("Error reading search results:", e)
+                return []
+        else:
+            print("Failed to retrieve search results")
+            return []
+
+        records = []
+        for start in range(0, count, batch_size):
+            fetch_url = f"{self.base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&retstart={start}&retmax={batch_size}&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
+            fetch_response = requests.get(fetch_url)
+            if fetch_response.status_code == 200:
+                records.extend(fetch_response.content.decode('utf-8').split('\n\n'))  # Each record separated by two newlines
+                print(f"Fetched {start + batch_size} of {count} records")
+            else:
+                print(f"Failed to fetch data for batch starting at {start}")
+
+        return records
+
+class ids_pubmed():
+    def __init__(self):
+        self.snp_ids = []
+        self.uids = []
+        self.gene_names = []
+        self.names = []
+        self.records = []
+        self.gene_full_names = []
+        self.saved_snp_id = []
+
+    def search_ids(self, search_email):
+        removal_index = []
+        Entrez.email = search_email
+        records = []
+        for snp_id in self.snp_ids:
+            record = Entrez.read(Entrez.elink(dbfrom="snp",
+                                              id=snp_id.replace('rs', ''),
+                                              db="gene"))
+            if record[0]['LinkSetDb'] == []:
+                removal_index.append(snp_id)
+                print("index is removed: ", snp_id)
+            else:
+                results = record[0]['LinkSetDb'][0]['Link']
+                multi_gene = []
+                multi_full_name = []
+                multi_uid = []
+                for result in results:
+                    uid = result['Id']
+                    handle = Entrez.esummary(db="gene", id=uid)
+                    uid_record = Entrez.read(handle)
+                    records.append(uid_record)
+                    handle.close()
+                    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
+                    gene_name = uid_summary['Name']
+                    gene_full_name = uid_summary['Description']
+                    if len(results) > 1:
+                        multi_gene.append(gene_name)
+                        multi_full_name.append(gene_full_name)
+                        multi_uid.append(uid)
+                    else:
+                        multi_gene = gene_name
+                        multi_full_name = gene_full_name
+                        multi_uid = uid
+
+                if len(results) > 1:
+                    multi_uid = "#".join(multi_uid)
+                    multi_gene = "#".join(multi_gene)
+                    multi_full_name = "#".join(multi_full_name)
+
+                self.uids.append(multi_uid)
+                self.gene_names.append(multi_gene)
+                self.gene_full_names.append(multi_full_name)
+                self.saved_snp_id.append(snp_id)
+        return removal_index, records, self.uids, self.gene_names, self.gene_full_names
+
+# Example usage:
+api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
+email = "lrmercadod@gmail.com"  # Replace with your email
+
+downloader = PubMedDownloader(api_key, email)
+topic = "zinc"  # Define the topic of interest
+pubmed_records = downloader.fetch_pubmed_data(topic, 10000)  # Adjust batch size as needed
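One caveat on the rewritten module above: its docstring still promises a "robust retry mechanism and API rate limiting", but fetch_pubmed_data calls requests.get with no retry or delay, while the deleted robust_request used exponential backoff. A minimal sketch of how that could be reinstated around the E-utilities calls (the helper name is hypothetical; NCBI allows up to 10 requests per second with an API key, 3 without):

    import time
    import requests

    def get_with_retry(url, max_attempts=5, min_interval=0.1):
        # Hypothetical helper mirroring the deleted robust_request:
        # exponential backoff on failure, plus a short pause on success to
        # stay under NCBI's 10 requests/second limit for API-key traffic.
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=60)
                if response.status_code == 200:
                    time.sleep(min_interval)
                    return response
                print(f"HTTP {response.status_code}, attempt {attempt}/{max_attempts}")
            except requests.RequestException as e:
                print(f"Request failed: {e}, attempt {attempt}/{max_attempts}")
            time.sleep(2 ** attempt)  # exponential backoff, as in robust_request
        return None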
62 changes: 42 additions & 20 deletions code/step_1_data_collection_Luis_.py
@@ -1,21 +1,43 @@
-from Bio import Entrez
+import requests
 import time
-def download_data(query, batch_size=1000, delay=1):
-    Entrez.email = "your.email@example.com"
-    handle = Entrez.esearch(db="pubmed", term=query, retmax=1000000)
-    record = Entrez.read(handle)
-    ids = record["IdList"]
-    total = len(ids)
-    print(f"Total number of records: {total}")
-    for i in range(0, total, batch_size):
-        print(f"Downloading records {i+1}-{min(i+batch_size, total)}")
-        ids_batch = ids[i:i+batch_size]
-        handle = Entrez.efetch(db="pubmed", id=",".join(ids_batch), rettype="medline", retmode="text")
-        data = handle.read()
-        # Do something with the data, e.g., save it to a file
-        with open("data.txt", "a", encoding='utf-8') as f:
-            f.write(data)
-        handle.close()
-        time.sleep(delay)
-
-download_data("zinc")
+
+def fetch_pubmed_data(query, max_results=1000000):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    api_key = "19bea34a4dbdbc6ef30392cee15943365309"  # Replace with your actual NCBI API key
+    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax=10000&api_key={api_key}"
+    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&rettype=abstract&retmode=text&api_key={api_key}"
+
+    # Perform the initial search
+    search_response = requests.get(search_url)
+    if search_response.status_code != 200:
+        print("Failed to retrieve data")
+        return
+
+    search_results = search_response.text
+    id_list = search_results.split('<IdList>')[1].split('</IdList>')[0]
+    id_list = id_list.strip().split()
+
+    print(f"Found {len(id_list)} records, fetching data...")
+
+    # Fetch details of all IDs
+    records = []
+    for start in range(0, len(id_list), 500):  # PubMed allows fetching up to 500 records at a time
+        end = min(start + 500, len(id_list))
+        ids = ','.join(id_list[start:end])
+        fetch_response = requests.get(f"{fetch_url}&id={ids}")
+        if fetch_response.status_code == 200:
+            records.append(fetch_response.text)
+        else:
+            print("Failed to fetch data for some records.")
+        time.sleep(0.5)  # to prevent hitting rate limit
+
+    return records
+
+# Example usage
+topic = "zinc"
+downloaded_data = fetch_pubmed_data(topic)
+
+# Optionally, save the data to a file
+with open("pubmed_data.txt", "w") as file:
+    for record in downloaded_data:
+        file.write(record)
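A caveat on the new fetch_pubmed_data above: splitting the esearch XML on '<IdList>' and whitespace leaves each token wrapped in <Id>…</Id> tags, so the id parameter sent to efetch is not a list of bare PMIDs. A sketch of the same extraction using the standard library's XML parser (the function name is illustrative):

    import xml.etree.ElementTree as ET

    def parse_id_list(esearch_xml):
        # Return bare PMID strings from an esearch XML response.
        root = ET.fromstring(esearch_xml)
        return [id_elem.text for id_elem in root.findall('.//IdList/Id')]

    # e.g. id_list = parse_id_list(search_response.text)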
Empty file added pubmed_data.txt
