Committing changes and uploading code #6

Merged: 1 commit, Jul 25, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -41,3 +41,5 @@ config.ini
/gene_based_records
checkpoint.json
checkpoint.json
checkpoint.json
*.txt
2 changes: 1 addition & 1 deletion checkpoint.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions code/lib/Literature_Data_Preprocessing.py
@@ -1,3 +1,5 @@


# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 00:16:25 2020
@@ -11,8 +13,8 @@
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from io import StringIO
from sklearn.feature_extraction import stop_words
#from sklearn.feature_extraction import _stop_words as stop_words
# from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction import _stop_words as stop_words
import re
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
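
The import change in this hunk tracks scikit-learn's relocation of its stop-word list: the public module sklearn.feature_extraction.stop_words was deprecated around release 0.22 and later removed, so newer installations only expose the list through the private _stop_words module. A minimal, version-tolerant import sketch (not part of this commit; the fallback and alias are assumptions):

try:
    # newer scikit-learn: stop words live only in the private module
    from sklearn.feature_extraction import _stop_words as stop_words
except ImportError:
    # older scikit-learn still ships the public module
    from sklearn.feature_extraction import stop_words

print(len(stop_words.ENGLISH_STOP_WORDS))  # frozenset of English stop words

Pinning a known-good scikit-learn version in the requirements would avoid depending on a private module altogether.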
@@ -319,7 +321,7 @@ def doc_preprocessor(self, sentence, stem=False):
def making_doc_data(self, gene_list, name, dic):
preprocessed_dir=self.preprocessed_dir
counting=0
handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w")
handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), "w", encoding='utf-8')
if gene_list == None:
for i in range(len(dic)):
if counting==10000:
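
The second hunk opens the .data.doc.txt output with an explicit UTF-8 encoding, so the write no longer depends on the platform's default locale. A minimal illustration of why that matters (the file name below is made up):

text = "α-synuclein and naïve T cells"
# Without encoding='utf-8', open() falls back to locale.getpreferredencoding(),
# which is cp1252 on many Windows setups and can raise UnicodeEncodeError here.
with open("example.data.doc.txt", "w", encoding="utf-8") as handle:
    handle.write(text + "\n")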
Binary file modified code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc
Binary file not shown.
12 changes: 9 additions & 3 deletions code/step1_data_collection_Luis_genes.py
@@ -32,7 +32,14 @@ def fetch_pubmed_data(self, gene_name):
search_url = f"{self.base_url}esearch.fcgi?db=pubmed&term={gene_name}[Gene Name]&retmax=1&api_key={self.api_key}&usehistory=y"
search_response = requests.get(search_url, timeout=10)
if search_response.status_code == 200:
search_results = Entrez.read(BytesIO(search_response.content))
# Check if the response content is valid XML
try:
search_results = Entrez.read(BytesIO(search_response.content))
except Exception as e:
logging.error(f"XML parsing error: {e}")
logging.debug(f"Response content: {search_response.content.decode('utf-8')}")
raise e

if 'WebEnv' in search_results and 'QueryKey' in search_results:
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']
@@ -95,7 +102,6 @@ def load_gene_names(file_path):
# Load gene names and symbols
full_names = load_gene_names('./data/gene_name_info/query_full_name.txt')
symbols = load_gene_names('./data/gene_name_info/query_symbol.txt')

# Fetch records for each gene name and symbol
for gene in full_names + symbols:
downloader.fetch_pubmed_data(gene)
downloader.fetch_pubmed_data(gene)
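
The new try/except around Entrez.read logs the raw response before re-raising, which helps when NCBI returns an HTML error page or a throttling notice instead of XML. A stripped-down sketch of the same pattern (the e-mail address, gene name, and API key are placeholders, not values from this PR):

import logging
from io import BytesIO

import requests
from Bio import Entrez

Entrez.email = "you@example.org"  # NCBI asks for a contact address

def search_pubmed(gene_name, api_key):
    """Run an esearch query and parse the XML, logging the body if parsing fails."""
    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        f"?db=pubmed&term={gene_name}[Gene Name]&retmax=1"
        f"&api_key={api_key}&usehistory=y"
    )
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    try:
        return Entrez.read(BytesIO(response.content))
    except Exception as exc:
        logging.error("XML parsing error: %s", exc)
        logging.debug("Response content: %s", response.content.decode("utf-8", "replace"))
        raise

A bare raise, rather than raise e, is the idiomatic way to re-raise the caught exception unchanged.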
153 changes: 153 additions & 0 deletions code/step2_data_preprocessing_Luis.py
@@ -0,0 +1,153 @@
import os
import pathlib
import sys
import unicodedata
import traceback
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
if not os.path.exists(path):
print(f"Directory does not exist: {path}")
return False
return True

def safe_read(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except UnicodeDecodeError:
try:
with open(file_path, 'r', encoding='latin-1') as f:
return f.read()
except Exception as e:
print(f"Error reading file {file_path}: {e}")
return None
except Exception as e:
print(f"Unexpected error reading file {file_path}: {e}")
return None

def process_gene_docs_in_chunks(file_path, chunk_size=1000):
gene2doc = {}
with open(file_path, 'r', encoding='utf-8') as f:
current_gene = None
current_doc = []
for line in f:
if line.strip() == "":
if current_gene and current_doc:
gene2doc[current_gene] = ''.join(current_doc)
current_gene = None
current_doc = []
elif not current_gene:
current_gene = line.strip()
else:
current_doc.append(line)

if len(gene2doc) % chunk_size == 0 and len(gene2doc) > 0:
print(f"Processed {len(gene2doc)} genes...")
yield gene2doc
gene2doc = {}

if current_gene and current_doc:
gene2doc[current_gene] = ''.join(current_doc)

if gene2doc:
yield gene2doc

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(output, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt')

if not os.path.exists(consolidated_file):
print("Consolidated file not found. Starting consolidation process...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
print("No gene-based documents found. Skipping this step.")
else:
with open(consolidated_file, 'w', encoding='utf-8') as outfile:
for i, file in enumerate(gene_files, 1):
try:
content = safe_read(os.path.join(gene_based_dir, file))
if content is not None:
content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
outfile.write(content + '\n\n')
else:
print(f"Skipping file {file} due to reading error.")
except Exception as e:
print(f"Error processing file {file}: {e}")
print(traceback.format_exc())
if i % 1000 == 0:
print(f"Consolidating file {i}/{len(gene_files)}")
print("All gene-based documents consolidated.")
else:
print("Consolidated file found. Skipping consolidation process.")

print("Processing consolidated gene-based document...")
for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file):
print(f"Processing chunk with {len(gene2doc_chunk)} genes...")
doc_gene = list(gene2doc_chunk.keys())
lp.making_doc_data(doc_gene, 'consolidated_gene_docs', gene2doc_chunk)

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
# print("No baseline documents found. Skipping this step.")
# else:
# print(f"Found {len(baseline_files)} baseline documents.")
# for baseline_file in baseline_files:
# print(f"Processing {baseline_file}...")
# baseline_content = safe_read(os.path.join(baseline_doc_dir, baseline_file))
# if baseline_content is None:
# print(f"Skipping {baseline_file} due to reading error.")
# continue
# baseline_content = unicodedata.normalize('NFKD', baseline_content).encode('ascii', 'ignore').decode('ascii')

# # Split the content into individual documents
# baseline_docs = baseline_content.split('\n\n') # Adjust the separator as needed

# output_name = baseline_file.replace('_consolidated.txt', '')
# full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
# meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

# total_FullText = []
# for doc in baseline_docs:
# FullText, Meta = lp.Medine_mapping(doc)
# total_FullText.append(FullText)
# full_handle.write(FullText + '\n')
# meta_handle.write(Meta + '\n')

# full_handle.close()
# meta_handle.close()

# print(f"Preprocessing baseline document: {output_name}")
# lp.making_doc_data(None, output_name, total_FullText)

print("All processing completed.")
111 changes: 111 additions & 0 deletions code/step2_data_preprocessing_Luis_Custom.py
@@ -0,0 +1,111 @@
import os
import pathlib
import sys
sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp

def check_directory(path):
if not os.path.exists(path):
print(f"Directory does not exist: {path}")
return False
return True

def safe_read_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except Exception as e:
print(f"Error reading file {file_path}: {e}")
return None

base = sys.argv[1]
output = sys.argv[2]

# Update paths to match your new structure
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(base, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print(f"Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents
print("Processing gene-based documents...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
print("No gene-based documents found. Skipping this step.")
gene2doc = {}
else:
print(f"First few gene files: {gene_files[:5]}")
file_names = [os.path.join(gene_based_dir, f) for f in gene_files]
data_list = [safe_read_file(f) for f in file_names]
data_list = [d for d in data_list if d is not None] # Remove None values

if data_list:
arr_list = lp.combining_files(gene_files, data_list, ['FullText'], 3)
for i in range(len(gene_files)):
lp.Indexing(os.path.join(comb_dir, gene_files[i]), arr_list[gene_files[i]])

gene2doc = lp.gene2doc_mapping(arr_list[gene_files[0]])
else:
print("No valid gene-based documents found. Skipping this step.")
gene2doc = {}

# Process baseline documents
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
# print("No baseline documents found. Skipping this step.")
# else:
# print(f"Found {len(baseline_files)} baseline documents.")
# for baseline_file in baseline_files:
# print(f"Processing {baseline_file}...")
# baseline_content = safe_read_file(os.path.join(baseline_doc_dir, baseline_file))
# if baseline_content is None:
# continue

# # Split the content into individual documents
# baseline_docs = baseline_content.split('\n\n') # Adjust the separator as needed

# output_name = baseline_file.replace('_consolidated.txt', '')
# full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
# meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

# total_FullText = []
# for doc in baseline_docs:
# FullText, Meta = lp.Medine_mapping(doc)
# total_FullText.append(FullText)
# full_handle.write(FullText + '\n')
# meta_handle.write(Meta + '\n')

# full_handle.close()
# meta_handle.close()

# print(f"Preprocessing baseline document: {output_name}")
# lp.making_doc_data(None, output_name, total_FullText)

if gene2doc:
print("Preprocessing gene-based documents...")
doc_gene = list(gene2doc.keys())
lp.making_doc_data(doc_gene, gene_files[0], gene2doc)
else:
print("No gene-based documents to process.")

print("All processing completed.")