From f16f5d27a20bf950c8efb023161000013776a62d Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Wed, 8 May 2024 08:09:14 -0400 Subject: [PATCH] Genes and Query extraction Implementation of an exogenous gene extraction method. --- .gitignore | 1 + checkpoint.json | 1 + code/lib/Loading_PudMed.py | 3 +- .../Loading_PudMed.cpython-311.pyc | Bin 11092 -> 11158 bytes code/step1_data_collection.py | 4 +- code/step1_data_collection_Luis_genes.py | 94 ++++++++++++++++++ ...y => step_1_data_collection_Luis_query.py} | 0 7 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 checkpoint.json create mode 100644 code/step1_data_collection_Luis_genes.py rename code/{step_1_data_collection_Luis_.py => step_1_data_collection_Luis_query.py} (100%) diff --git a/.gitignore b/.gitignore index 8dc7703..7531fff 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ results/baseline_doc/pubmed.zinc.2.15.txt results/baseline_doc/zinc AND 2013\[Date\].2013.txt /results config.ini +/gene_based_records diff --git a/checkpoint.json b/checkpoint.json new file mode 100644 index 0000000..c4d6d57 --- /dev/null +++ b/checkpoint.json @@ -0,0 +1 @@ +{"tumor protein p53": "./gene_based_records/tumor_protein_p53.txt", "epidermal growth factor receptor": "./gene_based_records/epidermal_growth_factor_receptor.txt", "apolipoprotein E": "./gene_based_records/apolipoprotein_E.txt", "tumor necrosis factor": "./gene_based_records/tumor_necrosis_factor.txt", "vascular endothelial growth factor A": "./gene_based_records/vascular_endothelial_growth_factor_A.txt", "interleukin 6": "./gene_based_records/interleukin_6.txt", "transforming growth factor beta 1": "./gene_based_records/transforming_growth_factor_beta_1.txt", "methylenetetrahydrofolate reductase": "./gene_based_records/methylenetetrahydrofolate_reductase.txt", "hypoxia inducible factor 1 subunit alpha": "./gene_based_records/hypoxia_inducible_factor_1_subunit_alpha.txt", "erb-b2 receptor tyrosine kinase 2": 
"./gene_based_records/erb-b2_receptor_tyrosine_kinase_2.txt", "estrogen receptor 1": "./gene_based_records/estrogen_receptor_1.txt", "interleukin 10": "./gene_based_records/interleukin_10.txt", "amyloid beta precursor protein": "./gene_based_records/amyloid_beta_precursor_protein.txt", "signal transducer and activator of transcription 3": "./gene_based_records/signal_transducer_and_activator_of_transcription_3.txt", "BRCA1 DNA repair associated": "./gene_based_records/BRCA1_DNA_repair_associated.txt", "angiotensin I converting enzyme": "./gene_based_records/angiotensin_I_converting_enzyme.txt", "KRAS proto-oncogene, GTPase": "./gene_based_records/KRAS_proto-oncogene,_GTPase.txt", "brain derived neurotrophic factor": "./gene_based_records/brain_derived_neurotrophic_factor.txt", "B-Raf proto-oncogene, serine/threonine kinase": "./gene_based_records/B-Raf_proto-oncogene,_serine_threonine_kinase.txt", "matrix metallopeptidase 9": "./gene_based_records/matrix_metallopeptidase_9.txt", "vitamin D receptor": "./gene_based_records/vitamin_D_receptor.txt", "C-reactive protein": "./gene_based_records/C-reactive_protein.txt", "CD274 molecule": "./gene_based_records/CD274_molecule.txt", "androgen receptor": "./gene_based_records/androgen_receptor.txt", "adiponectin, C1Q and collagen domain containing": "./gene_based_records/adiponectin,_C1Q_and_collagen_domain_containing.txt", "AKT serine/threonine kinase 1": "./gene_based_records/AKT_serine_threonine_kinase_1.txt", "ATP binding cassette subfamily B member 1": "./gene_based_records/ATP_binding_cassette_subfamily_B_member_1.txt", "nuclear factor kappa B subunit 1": "./gene_based_records/nuclear_factor_kappa_B_subunit_1.txt", "interleukin 1 beta": "./gene_based_records/interleukin_1_beta.txt", "major histocompatibility complex, class II, DR beta 1": "./gene_based_records/major_histocompatibility_complex,_class_II,_DR_beta_1.txt"} \ No newline at end of file diff --git a/code/lib/Loading_PudMed.py b/code/lib/Loading_PudMed.py 
index 31104b5..2ea1bb4 100644 --- a/code/lib/Loading_PudMed.py +++ b/code/lib/Loading_PudMed.py @@ -12,6 +12,7 @@ from io import StringIO import time sys.path.append('lib') +from http.client import IncompleteRead from Bio import Medline import os @@ -216,7 +217,7 @@ def search_full(self, ix, data_dir, search_results, starting, count, batch): data = fetch_handle.read() fetch_handle.close() out_handle.write(data) - time.sleep(2) # Delay between each batch fetch to respect the API rate limit + time.sleep(5) # Delay between each batch fetch to respect the API rate limit out_handle.close() def fetch_rec(self, rec_id, entrez_handle): diff --git a/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc b/code/lib/__pycache__/Loading_PudMed.cpython-311.pyc index 8f0ff861105a8459c5fdcf4f477c92fa227c55e1..5762f6d72098af2798e59043224cd2bbb654b163 100644 GIT binary patch delta 1422 zcmY+EO>7%Q6vubg-%gy^jvJ?G+B(EZydi`@OOa>@{ZP_I6bdf+kX2+Y-f_Iqez)Ai%^wVSE^?5~@X*Yj~JW%pJ3m|`$PwfZals=^~1X~Cj{>L(fT zOD1>BNZetHau=WOXtPSTGN3AF6eD>}F?#N(IEznrRC0!TLD71}2($b9mYkYvH!be- zGS`jHTkJK)dd1J`mn`4;FtiOd*C<=)(`n|-$NQxYNgbX1|NF%fi42TDbQI;0J zB|c+0aVt5&3Y~9~msB>`Ihs1H9vi0E65#;h5yYa#Wc;#jnTGCj7$bL)K;^hDf%facYL0Gm*dQ}xVsLEF27Ovd6QhNvP}^8Y0RoBB^>3$30diE$eCcesiW~nN)ZT-7<8_bi2a5BeGLP uEm-e1CG5WA&-k3}8ZC>@!wyc+KcuHHQJrdMzgH%{{r^XNH?r$-Vg2%^}}8 znLC4>mym)06+soPRD_UFTcK8{2p;@^5Ptwdyc7uuaSy0ZJR*4IiJ83$veJESZ)SGy zyEC)5R(`o++&7H0g5x*+NbB&9F~$DcS@>W`Z5wq%Re1c3J%a6-P;ZZj?O1Lpd5et?)fc8{p9R7+;Q-={$-*SAb!;o(@Fbb%2qYZ}grfLqB!9j{ z(`N`K;aP+^EX9Bn`y5$GA{<483BT*Q7WCm5nO-E63C9U1#8PHI3an+GXDfpnnF>RJ z4@XbNvm`Lom+f8!dQH9_I2|ucEMKSqbRg9H08Bl~%nEHT7weDLQaVD$eI4Jka=;h`T^4`O zJi%@cGWl(FYn6gx>sS$J;?#_RY!I8wNvc!OU+2JORIA%=J1(}N8Q&F^0N$1b^)M)k zjGyjwy}(R|sjknVfyH(sX|X=`QkalddlfVVYa5-hCI8@AE(eFEnu8l8b{);9-Am&5 z>=GugJ6mF3iO**rVeYmLe4?_7m@F*o^AxixT7?Bn=EsGjRq8A=8^${8+zZu?k0-T) zHCZ_bC>RLzL+BE&AVSS`Jb0QuHTu+fiwDiM$E6VaiqmuQUZ_q_X2@#T4!~O!n)Cw; z0>`@seer6shC;tCme>!2UyB~oa~RRo#LxM0(O9@H6A!Yxw&+)H6Sfgp&*+Mh;X2_R 
import requests
from Bio import Entrez
from io import BytesIO
import time
import os
import logging
import json


class GenePubMedDownloader:
    """Download PubMed MEDLINE records for gene names via the NCBI E-utilities,
    resuming across runs through a JSON checkpoint file that maps each processed
    gene name to the path of its saved record file."""

    def __init__(self, api_key, email, output_dir, max_records_per_query=9999, checkpoint_file="checkpoint.json"):
        """
        Args:
            api_key: NCBI API key appended to every E-utilities request.
            email: Contact email; registered with Biopython's Entrez as required by NCBI.
            output_dir: Directory where per-gene MEDLINE text files are written (created if absent).
            max_records_per_query: Page size for efetch batching (NCBI caps retmax at 9999 for efetch).
            checkpoint_file: JSON file recording genes already fetched, for resume support.
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.api_key = api_key
        self.email = email
        self.max_records_per_query = max_records_per_query
        Entrez.email = email  # Set email for NCBI E-utilities
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.checkpoint_file = checkpoint_file
        self.checkpoint_data = self.load_checkpoint()

    def fetch_pubmed_data(self, gene_name):
        """Fetch every MEDLINE record matching ``gene_name[Gene Name]`` and save them to one file.

        Uses esearch with ``usehistory=y`` to park the result set on the NCBI history
        server, then pages through it with efetch. Retries transient request errors
        with exponential backoff (up to 5 attempts).

        Returns:
            The output file path on success (or on a checkpoint hit); an empty list
            when nothing could be fetched (kept as ``[]`` for backward compatibility
            with existing callers).
        """
        if gene_name in self.checkpoint_data:
            logging.info(f"Skipping {gene_name} (already processed)")
            return self.checkpoint_data[gene_name]

        # Sanitize the gene name into a filesystem-safe file stem.
        normalized_gene = gene_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
        records = []
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                search_url = (
                    f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]"
                    f"&retmax=1&api_key={self.api_key}&usehistory=y"
                )
                search_response = requests.get(search_url, timeout=10)
                if search_response.status_code != 200:
                    # Non-OK server answer: give up without retrying, as the original did.
                    break
                search_results = Entrez.read(BytesIO(search_response.content))
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
                count = int(search_results['Count'])
                logging.info(f"Total records found for {gene_name}: {count}")
                if count == 0:
                    break
                for start in range(0, count, self.max_records_per_query):
                    # BUG FIX: the original embedded the text
                    # "min(count - start, self.max_records_per_query)" literally in the
                    # URL (no f-string braces), so NCBI never received a numeric retmax.
                    retmax = min(count - start, self.max_records_per_query)
                    fetch_url = (
                        f"{self.base_url}efetch.fcgi?db=pubmed&rettype=medline&retmode=text"
                        f"&retstart={start}&retmax={retmax}"
                        f"&webenv={webenv}&query_key={query_key}&api_key={self.api_key}"
                    )
                    fetch_response = requests.get(fetch_url, timeout=10)
                    records.append(fetch_response.text)
                    logging.info(f"Fetched records for {gene_name} starting from {start}")
                file_path = self.save_records_to_file(normalized_gene, records)
                self.checkpoint_data[gene_name] = file_path
                self.save_checkpoint()
                return file_path
            except requests.exceptions.RequestException as e:
                logging.error(f"Attempt {attempt}: An error occurred: {e}")
                time.sleep(2 ** attempt)  # exponential backoff before retrying
        return []

    def save_records_to_file(self, gene_name, records):
        """Write the fetched MEDLINE batches to ``<output_dir>/<gene_name>.txt`` and return the path."""
        filename = f"{gene_name}.txt"
        file_path = os.path.join(self.output_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n".join(records))
        logging.info(f"Saved records for {gene_name} to {file_path}")
        return file_path

    def load_checkpoint(self):
        """Return the checkpoint dict from disk, or an empty dict when no checkpoint exists yet."""
        if os.path.exists(self.checkpoint_file):
            with open(self.checkpoint_file, 'r') as file:
                return json.load(file)
        return {}

    def save_checkpoint(self):
        """Persist the in-memory checkpoint dict so an interrupted run can resume."""
        with open(self.checkpoint_file, 'w') as file:
            json.dump(self.checkpoint_data, file)


def load_gene_names(file_path):
    """Return the non-blank, stripped lines of *file_path* as a list of gene names."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network downloads.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # SECURITY: credentials are hardcoded here; move them to config/environment
    # variables and rotate this API key, since it has been committed to history.
    api_key = "19bea34a4dbdbc6ef30392cee15943365309"
    email = "lrmercadod@gmail.com"
    output_dir = "./gene_based_records/"
    downloader = GenePubMedDownloader(api_key, email, output_dir)

    # Load gene names and symbols
    full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
    symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

    # Fetch records for each gene name and symbol
    for gene in full_names + symbols:
        downloader.fetch_pubmed_data(gene)