Commit: committing changes and uploading code
Showing 11 changed files with 775 additions and 8 deletions.
@@ -41,3 +41,5 @@ config.ini
 /gene_based_records
 checkpoint.json
 checkpoint.json
+checkpoint.json
+*.txt
code/lib/__pycache__/Literature_Data_Preprocessing.cpython-312.pyc
BIN +45 Bytes (100%), binary file not shown.

@@ -0,0 +1,153 @@
import os
import pathlib
import sys
import unicodedata
import traceback

sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp


def check_directory(path):
    """Return True if the directory exists, printing a warning otherwise."""
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return False
    return True


def safe_read(file_path):
    """Read a file as UTF-8, falling back to Latin-1; return None on failure."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return None
    except Exception as e:
        print(f"Unexpected error reading file {file_path}: {e}")
        return None


def process_gene_docs_in_chunks(file_path, chunk_size=1000):
    """Stream gene records from the consolidated file, yielding dicts of at
    most chunk_size genes so the whole corpus never sits in memory at once."""
    gene2doc = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        current_gene = None
        current_doc = []
        for line in f:
            if line.strip() == "":
                # A blank line closes the current record.
                if current_gene and current_doc:
                    gene2doc[current_gene] = ''.join(current_doc)
                current_gene = None
                current_doc = []
            elif not current_gene:
                # The first non-blank line of a record is the gene name.
                current_gene = line.strip()
            else:
                current_doc.append(line)

            if len(gene2doc) % chunk_size == 0 and len(gene2doc) > 0:
                print(f"Processed {len(gene2doc)} genes...")
                yield gene2doc
                gene2doc = {}

        # Flush the last record if the file does not end with a blank line.
        if current_gene and current_doc:
            gene2doc[current_gene] = ''.join(current_doc)

    # Yield whatever remains as the final, possibly smaller, chunk.
    if gene2doc:
        yield gene2doc

# Command-line arguments: the input batch directory and the output directory.
if len(sys.argv) != 3:
    sys.exit(f"Usage: python {sys.argv[0]} <base_dir> <output_dir>")

base = sys.argv[1]
output = sys.argv[2]

# Derive the working directories from the arguments.
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(output, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print("Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
    sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents: merge the per-gene files into a single
# consolidated file, then stream it back in memory-bounded chunks.
consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt')

if not os.path.exists(consolidated_file):
    print("Consolidated file not found. Starting consolidation process...")
    gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
    print(f"Found {len(gene_files)} gene-based documents.")

    if not gene_files:
        print("No gene-based documents found. Skipping this step.")
    else:
        with open(consolidated_file, 'w', encoding='utf-8') as outfile:
            for i, file in enumerate(gene_files, 1):
                try:
                    content = safe_read(os.path.join(gene_based_dir, file))
                    if content is not None:
                        # Normalize to plain ASCII to avoid downstream encoding issues.
                        content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
                        outfile.write(content + '\n\n')
                    else:
                        print(f"Skipping file {file} due to reading error.")
                except Exception as e:
                    print(f"Error processing file {file}: {e}")
                    print(traceback.format_exc())
                if i % 1000 == 0:
                    print(f"Consolidating file {i}/{len(gene_files)}")
        print("All gene-based documents consolidated.")
else:
    print("Consolidated file found. Skipping consolidation process.")

# Guard against the case where no input files were found and no
# consolidated file was ever written.
if os.path.exists(consolidated_file):
    print("Processing consolidated gene-based document...")
    for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file):
        print(f"Processing chunk with {len(gene2doc_chunk)} genes...")
        doc_gene = list(gene2doc_chunk.keys())
        lp.making_doc_data(doc_gene, 'consolidated_gene_docs', gene2doc_chunk)
else:
    print("No consolidated file to process. Skipping.")

# Process baseline documents (currently disabled).
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
#     print("No baseline documents found. Skipping this step.")
# else:
#     print(f"Found {len(baseline_files)} baseline documents.")
#     for baseline_file in baseline_files:
#         print(f"Processing {baseline_file}...")
#         baseline_content = safe_read(os.path.join(baseline_doc_dir, baseline_file))
#         if baseline_content is None:
#             print(f"Skipping {baseline_file} due to reading error.")
#             continue
#         baseline_content = unicodedata.normalize('NFKD', baseline_content).encode('ascii', 'ignore').decode('ascii')

#         # Split the content into individual documents
#         baseline_docs = baseline_content.split('\n\n')  # Adjust the separator as needed

#         output_name = baseline_file.replace('_consolidated.txt', '')
#         full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
#         meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

#         total_FullText = []
#         for doc in baseline_docs:
#             FullText, Meta = lp.Medine_mapping(doc)
#             total_FullText.append(FullText)
#             full_handle.write(FullText + '\n')
#             meta_handle.write(Meta + '\n')

#         full_handle.close()
#         meta_handle.close()

#         print(f"Preprocessing baseline document: {output_name}")
#         lp.making_doc_data(None, output_name, total_FullText)

print("All processing completed.")

@@ -0,0 +1,111 @@
import os
import pathlib
import sys

sys.path.append('lib')
import lib.Literature_Data_Preprocessing as ldp


def check_directory(path):
    """Return True if the directory exists, printing a warning otherwise."""
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return False
    return True


def safe_read_file(file_path):
    """Read a file as UTF-8; return None (with a warning) on any failure."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None
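# Note: unlike safe_read in the chunked consolidation script above, this
# reader has no Latin-1 fallback; files that fail to decode are skipped.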

# Command-line arguments: the input batch directory and the output directory.
if len(sys.argv) != 3:
    sys.exit(f"Usage: python {sys.argv[0]} <base_dir> <output_dir>")

base = sys.argv[1]
output = sys.argv[2]

# Derive the working directories from the arguments. Unlike the chunked
# script, this one writes the arranged files under the input directory.
batch_dir = base
gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
comb_dir = os.path.join(base, 'arranged')
preprocessed_dir = os.path.join(output, 'preprocessed')

print("Checking directories...")
print(f"batch_dir: {batch_dir}")
print(f"gene_based_dir: {gene_based_dir}")
print(f"baseline_doc_dir: {baseline_doc_dir}")
print(f"comb_dir: {comb_dir}")
print(f"preprocessed_dir: {preprocessed_dir}")

if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
    sys.exit("One or more required directories do not exist. Please check the paths and try again.")

pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)

lp = ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir)

# Process gene-based documents by reading them all into memory at once.
print("Processing gene-based documents...")
gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
print(f"Found {len(gene_files)} gene-based documents.")

if not gene_files:
    print("No gene-based documents found. Skipping this step.")
    gene2doc = {}
else:
    print(f"First few gene files: {gene_files[:5]}")
    file_names = [os.path.join(gene_based_dir, f) for f in gene_files]
    data_list = [safe_read_file(f) for f in file_names]
    data_list = [d for d in data_list if d is not None]  # Drop unreadable files.

    if data_list:
        arr_list = lp.combining_files(gene_files, data_list, ['FullText'], 3)
        for i in range(len(gene_files)):
            lp.Indexing(os.path.join(comb_dir, gene_files[i]), arr_list[gene_files[i]])

        # Build the gene-to-document mapping from the first arranged file.
        gene2doc = lp.gene2doc_mapping(arr_list[gene_files[0]])
    else:
        print("No valid gene-based documents found. Skipping this step.")
        gene2doc = {}

# Process baseline documents (currently disabled).
# print("Processing baseline documents...")
# baseline_files = [f for f in os.listdir(baseline_doc_dir) if f.endswith('_consolidated.txt')]

# if not baseline_files:
#     print("No baseline documents found. Skipping this step.")
# else:
#     print(f"Found {len(baseline_files)} baseline documents.")
#     for baseline_file in baseline_files:
#         print(f"Processing {baseline_file}...")
#         baseline_content = safe_read_file(os.path.join(baseline_doc_dir, baseline_file))
#         if baseline_content is None:
#             continue

#         # Split the content into individual documents
#         baseline_docs = baseline_content.split('\n\n')  # Adjust the separator as needed

#         output_name = baseline_file.replace('_consolidated.txt', '')
#         full_handle = open(os.path.join(comb_dir, f'{output_name}.FullText.txt'), "w", encoding='utf-8')
#         meta_handle = open(os.path.join(comb_dir, f'{output_name}.meta.txt'), "w", encoding='utf-8')

#         total_FullText = []
#         for doc in baseline_docs:
#             FullText, Meta = lp.Medine_mapping(doc)
#             total_FullText.append(FullText)
#             full_handle.write(FullText + '\n')
#             meta_handle.write(Meta + '\n')

#         full_handle.close()
#         meta_handle.close()

#         print(f"Preprocessing baseline document: {output_name}")
#         lp.making_doc_data(None, output_name, total_FullText)

if gene2doc:
    print("Preprocessing gene-based documents...")
    doc_gene = list(gene2doc.keys())
    lp.making_doc_data(doc_gene, gene_files[0], gene2doc)
else:
    print("No gene-based documents to process.")

print("All processing completed.")