Committing changes and uploading code
lrm22005 committed Jul 25, 2024
1 parent 1613e69 commit 34e849f
Showing 11 changed files with 775 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -41,3 +41,5 @@ config.ini
/gene_based_records
checkpoint.json
checkpoint.json
checkpoint.json
*.txt
2 changes: 1 addition & 1 deletion checkpoint.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions code/lib/Literature_Data_Preprocessing.py
@@ -1,3 +1,5 @@


# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
@@ -11,8 +13,8 @@
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from io import StringIO
from sklearn.feature_extraction import stop_words
#from sklearn.feature_extraction import _stop_words as stop_words
# from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction import _stop_words as stop_words
import re
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
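This import swap tracks scikit-learn's relocation of its stop-word list: the old sklearn.feature_extraction.stop_words module was deprecated in 0.22 and later removed, with the private _stop_words module taking its place. A version-tolerant sketch (an aside, not part of this commit) would prefer the still-public ENGLISH_STOP_WORDS name and fall back to the private module:

    # Minimal compatibility sketch; assumes a standard scikit-learn install.
    try:
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as STOP_WORDS
    except ImportError:
        # Private location relied on by this commit (scikit-learn >= 0.22)
        from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as STOP_WORDS
    print('the' in STOP_WORDS)  # True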
@@ -319,7 +321,7 @@ def doc_preprocessor(self, sentence, stem=False):
    def making_doc_data(self, gene_list, name, dic):
        preprocessed_dir=self.preprocessed_dir
        counting=0
        handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w")
        handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w", encoding='utf-8')
        if gene_list == None:
            for i in range(len(dic)):
                if counting==10000:
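The change in this hunk opens the output file with an explicit encoding='utf-8' instead of the platform default (cp1252 on many Windows setups), which can raise UnicodeEncodeError once an abstract contains non-ASCII characters. A small illustration with a made-up string:

    text = "α-synuclein aggregation (Müller et al.)"   # hypothetical abstract fragment
    open("out.txt", "w", encoding="utf-8").write(text)  # safe on any platform
    # open("out.txt", "w").write(text)  # may raise UnicodeEncodeError under cp1252 ('α' is unmappable)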
Binary file modified code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc
Binary file not shown.
12 changes: 9 additions & 3 deletions code/step1_data_collection_Luis_genes.py
@@ -32,7 +32,14 @@ def fetch_pubmed_data(self, gene_name):
        search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
        search_response = requests.get(search_url, timeout=10)
        if search_response.status_code == 200:
            search_results = Entrez.read(BytesIO(search_response.content))
            # Check if the response content is valid XML
            try:
                search_results = Entrez.read(BytesIO(search_response.content))
            except Exception as e:
                logging.error(f"XML parsing error: {e}")
                logging.debug(f"Response content: {search_response.content.decode('utf-8')}")
                raise e

            if 'WebEnv' in search_results and 'QueryKey' in search_results:
                webenv = search_results['WebEnv']
                query_key = search_results['QueryKey']
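The try/except added here surfaces a real failure mode: under rate limiting or transient outages, E-utilities can return an HTML error page that Entrez.read cannot parse as XML. Because the handler re-raises, a caller could wrap the fetch in a retry loop; a hedged sketch, with the helper name and backoff policy assumed rather than taken from this commit:

    import time

    def fetch_with_retries(downloader, gene, attempts=3, wait=5):
        # Hypothetical helper: retry transient fetch/parse failures with linear backoff.
        for attempt in range(1, attempts + 1):
            try:
                return downloader.fetch_pubmed_data(gene)
            except Exception:
                if attempt == attempts:
                    raise  # exhausted retries; propagate the error
                time.sleep(wait * attempt)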
@@ -95,7 +102,6 @@ def load_gene_names(file_path):
# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
    downloader.fetch_pubmed_data(gene)
    downloader.fetch_pubmed_data(gene)
153 changes: 153 additions & 0 deletions code/step2_data_preprocessing_Luis.py
@@ -0,0 +1,153 @@
import os
import pathlib
import sys
import unicodedata
import traceback
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return False
    return True

def safe_read(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return None
    except Exception as e:
        print(f"Unexpected error reading file {file_path}: {e}")
        return None
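# Note: latin-1 maps all 256 byte values to characters, so the fallback read
# can never raise UnicodeDecodeError (e.g. b"\xff\xfe".decode("latin-1") gives
# 'ÿþ'); the trade-off is silent mojibake if the file used another encoding.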

def process_gene_docs_in_chunks(file_path, chunk_size=1000):
    gene2doc = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        current_gene = None
        current_doc = []
        for line in f:
            if line.strip() == "":
                if current_gene and current_doc:
                    gene2doc[current_gene] = ''.join(current_doc)
                current_gene = None
                current_doc = []
            elif not current_gene:
                current_gene = line.strip()
            else:
                current_doc.append(line)

            if len(gene2doc) % chunk_size == 0 and len(gene2doc) > 0:
                print(f"Processed {len(gene2doc)} genes...")
                yield gene2doc
                gene2doc = {}

        if current_gene and current_doc:
            gene2doc[current_gene] = ''.join(current_doc)

    if gene2doc:
        yield gene2doc
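# Since this is a generator, at most ~chunk_size completed gene documents are
# held in memory at a time; the consolidated file is streamed line by line.
# Hypothetical usage (file name for illustration only):
#     for chunk in process_gene_docs_in_chunks("consolidated_gene_docs.txt", 500):
#         for gene, doc in chunk.items():
#             print(gene, len(doc.split()))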

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(output, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
    sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt')

if not os.path.exists(consolidated_file):
    print("Consolidated file not found. Starting consolidation process...")
    gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
    print(f"Found {len(gene_files)} gene-based documents.")

    if not gene_files:
        print("No gene-based documents found. Skipping this step.")
    else:
        with open(consolidated_file, 'w', encoding='utf-8') as outfile:
            for i, file in enumerate(gene_files, 1):
                try:
                    content = safe_read(os.path.join(gene_based_dir, file))
                    if content is not None:
                        content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
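                        # NFKD decomposes accented characters before the
                        # ascii/ignore round-trip, so base letters survive:
                        # 'café' becomes 'cafe' rather than 'caf'.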
                        outfile.write(content + '\n\n')
                    else:
                        print(f"Skipping file {file} due to reading error.")
                except Exception as e:
                    print(f"Error processing file {file}: {e}")
                    print(traceback.format_exc())
                if i % 1000 == 0:
                    print(f"Consolidating file {i}/{len(gene_files)}")
        print("All gene-based documents consolidated.")
else:
    print("Consolidated file found. Skipping consolidation process.")

print("Processing consolidated gene-based document...")
for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file):
    print(f"Processing chunk with {len(gene2doc_chunk)} genes...")
    doc_gene = list(gene2doc_chunk.keys())
    lp.making_doc_data(doc_gene, 'consolidated_gene_docs', gene2doc_chunk)

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
#     print("No baseline documents found. Skipping this step.")
# else:
#     print(f"Found {len(baseline_files)} baseline documents.")
#     for baseline_file in baseline_files:
#         print(f"Processing {baseline_file}...")
#         baseline_content = safe_read(os.path.join(baseline_doc_dir, baseline_file))
#         if baseline_content is None:
#             print(f"Skipping {baseline_file} due to reading error.")
#             continue
#         baseline_content = unicodedata.normalize('NFKD', baseline_content).encode('ascii', 'ignore').decode('ascii')

#         # Split the content into individual documents
#         baseline_docs = baseline_content.split('\n\n')  # Adjust the separator as needed

#         output_name = baseline_file.replace('_consolidated.txt', '')
#         full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
#         meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

#         total_FullText = []
#         for doc in baseline_docs:
#             FullText, Meta = lp.Medine_mapping(doc)
#             total_FullText.append(FullText)
#             full_handle.write(FullText + '\n')
#             meta_handle.write(Meta + '\n')

#         full_handle.close()
#         meta_handle.close()

#         print(f"Preprocessing baseline document: {output_name}")
#         lp.making_doc_data(None, output_name, total_FullText)

print("All processing completed.")
111 changes: 111 additions & 0 deletions code/step2_data_preprocessing_Luis_Custom.py
@@ -0,0 +1,111 @@
import os
import pathlib
import sys
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return False
    return True

def safe_read_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(base, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
    sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
print("Processing gene-based documents...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
    print("No gene-based documents found. Skipping this step.")
    gene2doc = {}
else:
    print(f"First few gene files: {gene_files[:5]}")
    file_names = [os.path.join(gene_based_dir, f) for f in gene_files]
    data_list = [safe_read_file(f) for f in file_names]
    data_list = [d for d in data_list if d is not None]  # Remove None values

    if data_list:
        arr_list = lp.combining_files(gene_files, data_list, ['FullText'], 3)
        for i in range(len(gene_files)):
            lp.Indexing(os.path.join(comb_dir, gene_files[i]), arr_list[gene_files[i]])

        gene2doc = lp.gene2doc_mapping(arr_list[gene_files[0]])
    else:
        print("No valid gene-based documents found. Skipping this step.")
        gene2doc = {}

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
#     print("No baseline documents found. Skipping this step.")
# else:
#     print(f"Found {len(baseline_files)} baseline documents.")
#     for baseline_file in baseline_files:
#         print(f"Processing {baseline_file}...")
#         baseline_content = safe_read_file(os.path.join(baseline_doc_dir, baseline_file))
#         if baseline_content is None:
#             continue

#         # Split the content into individual documents
#         baseline_docs = baseline_content.split('\n\n')  # Adjust the separator as needed

#         output_name = baseline_file.replace('_consolidated.txt', '')
#         full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
#         meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

#         total_FullText = []
#         for doc in baseline_docs:
#             FullText, Meta = lp.Medine_mapping(doc)
#             total_FullText.append(FullText)
#             full_handle.write(FullText + '\n')
#             meta_handle.write(Meta + '\n')

#         full_handle.close()
#         meta_handle.close()

#         print(f"Preprocessing baseline document: {output_name}")
#         lp.making_doc_data(None, output_name, total_FullText)

if gene2doc:
    print("Preprocessing gene-based documents...")
    doc_gene = list(gene2doc.keys())
    lp.making_doc_data(doc_gene, gene_files[0], gene2doc)
else:
    print("No gene-based documents to process.")

print("All processing completed.")