Committing changes and uploading code #6

Merged: 1 commit, Jul 25, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -41,3 +41,5 @@ config.ini
/gene_based_records
checkpoint.json
checkpoint.json
checkpoint.json
*.txt
2 changes: 1 addition & 1 deletion checkpoint.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions code/lib/Literature_Data_Preprocessing.py
@@ -1,3 +1,5 @@


# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
@@ -11,8 +13,8 @@
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from io import StringIO
from sklearn.feature_extraction import stop_words
#from sklearn.feature_extraction import _stop_words as stop_words
# from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction import _stop_words as stop_words
import re
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
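
The import change in this hunk tracks scikit-learn's relocation of its stop-word list: the public module sklearn.feature_extraction.stop_words was deprecated around release 0.22 and later removed, so newer installations only expose the list through the private _stop_words module. A minimal, version-tolerant import sketch (not part of this commit; the fallback and alias are assumptions):

try:
    # newer scikit-learn: stop words live only in the private module
    from sklearn.feature_extraction import _stop_words as stop_words
except ImportError:
    # older scikit-learn still ships the public module
    from sklearn.feature_extraction import stop_words

print(len(stop_words.ENGLISH_STOP_WORDS))  # frozenset of English stop words

Pinning a known-good scikit-learn version in the requirements would avoid depending on a private module altogether.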
@@ -319,7 +321,7 @@ def doc_preprocessor(self, sentence, stem=False):
def making_doc_data(self, gene_list, name, dic):
preprocessed_dir=self.preprocessed_dir
counting=0
handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w")
handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w", encoding='utf-8')
if gene_list == None:
for i in range(len(dic)):
if counting==10000:
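
The second hunk opens the .data.doc.txt output with an explicit UTF-8 encoding, so the write no longer depends on the platform's default locale. A minimal illustration of why that matters (the file name below is made up):

text = "α-synuclein and naïve T cells"
# Without encoding='utf-8', open() falls back to locale.getpreferredencoding(),
# which is cp1252 on many Windows setups and can raise UnicodeEncodeError here.
with open("example.data.doc.txt", "w", encoding="utf-8") as handle:
    handle.write(text + "\n")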
Binary file modified code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc
Binary file not shown.
12 changes: 9 additions & 3 deletions code/step1_data_collection_Luis_genes.py
@@ -32,7 +32,14 @@ def fetch_pubmed_data(self, gene_name):
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url, timeout=10)
if search_response.status_code == 200:
search_results = Entrez.read(BytesIO(search_response.content))
# Check if the response content is valid XML
try:
search_results = Entrez.read(BytesIO(search_response.content))
except Exception as e:
logging.error(f"XML parsing error: {e}")
logging.debug(f"Response content: {search_response.content.decode('utf-8')}")
raise e

if 'WebEnv' in search_results and 'QueryKey' in search_results:
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
@@ -95,7 +102,6 @@ def load_gene_names(file_path):
# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
downloader.fetch_pubmed_data(gene)
downloader.fetch_pubmed_data(gene)
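
The new try/except around Entrez.read logs the raw response before re-raising, which helps when NCBI returns an HTML error page or a throttling notice instead of XML. A stripped-down sketch of the same pattern (the e-mail address, gene name, and API key are placeholders, not values from this PR):

import logging
from io import BytesIO

import requests
from Bio import Entrez

Entrez.email = "you@example.org"  # NCBI asks for a contact address

def search_pubmed(gene_name, api_key):
    """Run an esearch query and parse the XML, logging the body if parsing fails."""
    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        f"?db=pubmed&term={gene_name}[Gene Name]&retmax=1"
        f"&api_key={api_key}&usehistory=y"
    )
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    try:
        return Entrez.read(BytesIO(response.content))
    except Exception as exc:
        logging.error("XML parsing error: %s", exc)
        logging.debug("Response content: %s", response.content.decode("utf-8", "replace"))
        raise

A bare raise, rather than raise e, is the idiomatic way to re-raise the caught exception unchanged.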
153 changes: 153 additions & 0 deletions code/step2_data_preprocessing_Luis.py
@@ -0,0 +1,153 @@
import os
import pathlib
import sys
import unicodedata
import traceback
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
if not os.path.exists(path):
print(f"Directory does not exist: {path}")
return False
return True

def safe_read(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except UnicodeDecodeError:
try:
with open(file_path, 'r', encoding='latin-1') as f:
return f.read()
except Exception as e:
print(f"Error reading file {file_path}: {e}")
return None
except Exception as e:
print(f"Unexpected error reading file {file_path}: {e}")
return None

def process_gene_docs_in_chunks(file_path, chunk_size=1000):
gene2doc = {}
with open(file_path, 'r', encoding='utf-8') as f:
current_gene = None
current_doc = []
for line in f:
if line.strip() == "":
if current_gene and current_doc:
gene2doc[current_gene] = ''.join(current_doc)
current_gene = None
current_doc = []
elif not current_gene:
current_gene = line.strip()
else:
current_doc.append(line)

if len(gene2doc) % chunk_size == 0 and len(gene2doc) > 0:
print(f"Processed {len(gene2doc)} genes...")
yield gene2doc
gene2doc = {}

if current_gene and current_doc:
gene2doc[current_gene] = ''.join(current_doc)

if gene2doc:
yield gene2doc

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(output, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt')

if not os.path.exists(consolidated_file):
print("Consolidated file not found. Starting consolidation process...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
print("No gene-based documents found. Skipping this step.")
else:
with open(consolidated_file, 'w', encoding='utf-8') as outfile:
for i, file in enumerate(gene_files, 1):
try:
content = safe_read(os.path.join(gene_based_dir, file))
if content is not None:
content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
outfile.write(content + '\n\n')
else:
print(f"Skipping file {file} due to reading error.")
except Exception as e:
print(f"Error processing file {file}: {e}")
print(traceback.format_exc())
if i % 1000 == 0:
print(f"Consolidating file {i}/{len(gene_files)}")
print("All gene-based documents consolidated.")
else:
print("Consolidated file found. Skipping consolidation process.")

print("Processing consolidated gene-based document...")
for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file):
print(f"Processing chunk with {len(gene2doc_chunk)} genes...")
doc_gene = list(gene2doc_chunk.keys())
lp.making_doc_data(doc_gene, 'consolidated_gene_docs', gene2doc_chunk)

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
# print("No baseline documents found. Skipping this step.")
# else:
# print(f"Found {len(baseline_files)} baseline documents.")
# for baseline_file in baseline_files:
# print(f"Processing {baseline_file}...")
# baseline_content = safe_read(os.path.join(baseline_doc_dir, baseline_file))
# if baseline_content is None:
# print(f"Skipping {baseline_file} due to reading error.")
# continue
# baseline_content = unicodedata.normalize('NFKD', baseline_content).encode('ascii', 'ignore').decode('ascii')

# # Split the content into individual documents
# baseline_docs = baseline_content.split('\n\n') # Adjust the separator as needed

# output_name = baseline_file.replace('_consolidated.txt', '')
# full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
# meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

# total_FullText = []
# for doc in baseline_docs:
# FullText, Meta = lp.Medine_mapping(doc)
# total_FullText.append(FullText)
# full_handle.write(FullText + '\n')
# meta_handle.write(Meta + '\n')

# full_handle.close()
# meta_handle.close()

# print(f"Preprocessing baseline document: {output_name}")
# lp.making_doc_data(None, output_name, total_FullText)

print("All processing completed.")
111 changes: 111 additions & 0 deletions code/step2_data_preprocessing_Luis_Custom.py
@@ -0,0 +1,111 @@
import os
import pathlib
import sys
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
if not os.path.exists(path):
print(f"Directory does not exist: {path}")
return False
return True

def safe_read_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except Exception as e:
print(f"Error reading file {file_path}: {e}")
return None

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(base, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
print("Processing gene-based documents...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
print("No gene-based documents found. Skipping this step.")
gene2doc = {}
else:
print(f"First few gene files: {gene_files[:5]}")
file_names = [os.path.join(gene_based_dir, f) for f in gene_files]
data_list = [safe_read_file(f) for f in file_names]
data_list = [d for d in data_list if d is not None] # Remove None values

if data_list:
arr_list = lp.combining_files(gene_files, data_list, ['FullText'], 3)
for i in range(len(gene_files)):
lp.Indexing(os.path.join(comb_dir, gene_files[i]), arr_list[gene_files[i]])

gene2doc = lp.gene2doc_mapping(arr_list[gene_files[0]])
else:
print("No valid gene-based documents found. Skipping this step.")
gene2doc = {}

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
# print("No baseline documents found. Skipping this step.")
# else:
# print(f"Found {len(baseline_files)} baseline documents.")
# for baseline_file in baseline_files:
# print(f"Processing {baseline_file}...")
# baseline_content = safe_read_file(os.path.join(baseline_doc_dir, baseline_file))
# if baseline_content is None:
# continue

# # Split the content into individual documents
# baseline_docs = baseline_content.split('\n\n') # Adjust the separator as needed

# output_name = baseline_file.replace('_consolidated.txt', '')
# full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
# meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

# total_FullText = []
# for doc in baseline_docs:
# FullText, Meta = lp.Medine_mapping(doc)
# total_FullText.append(FullText)
# full_handle.write(FullText + '\n')
# meta_handle.write(Meta + '\n')

# full_handle.close()
# meta_handle.close()

# print(f"Preprocessing baseline document: {output_name}")
# lp.making_doc_data(None, output_name, total_FullText)

if gene2doc:
print("Preprocessing gene-based documents...")
doc_gene = list(gene2doc.keys())
lp.making_doc_data(doc_gene, gene_files[0], gene2doc)
else:
print("No gene-based documents to process.")

print("All processing completed.")