diff --git a/.gitignore b/.gitignore
index 704add0..8fc38da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,7 @@ output/embeddings/gene_based/gene_based_embeddings_part1.pt
 output/embeddings/gene_based/gene_based_embeddings_part0.pt
 output/embeddings/gene_based/gene_based_embeddings_part2.pt
 output/gene_based_preprocessed/processed_genes.json
+output/embeddings/gene_based/gene_based_embeddings_part0.pt
+output/embeddings/gene_based/gene_based_embeddings_part1.pt
+output/embeddings/gene_based/gene_based_embeddings_part2.pt
+output/gene_based_preprocessed/processed_genes.json
diff --git a/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-311.pyc b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-311.pyc
new file mode 100644
index 0000000..98ccd8f
Binary files /dev/null and b/code/lib/__pycache__/Literature_Data_Preprocessing.cpython-311.pyc differ
diff --git a/code/step2_data_preprocessing_Luis_new.py b/code/step2_data_preprocessing_Luis_new.py
index 0b32095..2250b4d 100644
--- a/code/step2_data_preprocessing_Luis_new.py
+++ b/code/step2_data_preprocessing_Luis_new.py
@@ -28,7 +28,7 @@ def safe_read(file_path):
         print(f"Unexpected error reading file {file_path}: {e}")
         return None
 
-def process_gene_docs_in_chunks(file_path, chunk_size=1000):
+def process_gene_docs_in_chunks(file_path, chunk_size=10000):
     gene2doc = {}
     with open(file_path, 'r', encoding='utf-8') as f:
         current_gene = None
@@ -75,7 +75,7 @@ def making_doc_data(self, gene_list, name, dic, mode='w'):
 
     else:
         for i in range(len(gene_list)):
-            if counting == 1000:
+            if counting == 10000:
                 print(i, '/', len(gene_list))
                 counting = 0
             data = dic[gene_list[i]]
@@ -93,7 +93,7 @@
 gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
 baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
 comb_dir = os.path.join(output, 'arranged')
-preprocessed_dir = os.path.join(output, 'preprocessed')
+preprocessed_dir = os.path.join(output, 'gene_based_preprocessed')
 
 print(f"Checking directories...")
 print(f"batch_dir: {batch_dir}")
diff --git a/code/step3_literature_embedding_training_Luis.py b/code/step3_literature_embedding_training_Luis.py
index d3721d5..c8fcca3 100644
--- a/code/step3_literature_embedding_training_Luis.py
+++ b/code/step3_literature_embedding_training_Luis.py
@@ -1,20 +1,68 @@
-import os
-import torch
-from torch.utils.data import Dataset, DataLoader
-from tqdm import tqdm
-import gc
-
+# Import necessary libraries
+import os  # For file and directory operations
+import torch  # PyTorch library for deep learning
+from torch.utils.data import Dataset, DataLoader  # For creating custom datasets and data loaders
+from tqdm import tqdm  # For progress bars
+import gc  # For garbage collection
+import json  # For working with JSON data
+
+# Try to import various transformer models and tokenizers
 try:
-    from transformers import AlbertTokenizer, AlbertModel
-    MODEL_NAME = 'albert-base-v2'
-    print("Using ALBERT model")
-except ImportError:
-    from transformers import BertTokenizer, BertModel
-    MODEL_NAME = 'bert-base-uncased'
-    print("ALBERT not available, using BERT model instead")
-
+    from transformers import (
+        AlbertTokenizer, AlbertModel,
+        BertTokenizer, BertModel,
+        RobertaTokenizer, RobertaModel,
+        DistilBertTokenizer, DistilBertModel,
+        XLNetTokenizer, XLNetModel
+    )
+    # Define a dictionary of available models with their respective tokenizers and model classes
+    MODELS = {
+        'albert': (AlbertTokenizer, AlbertModel, 'albert-base-v2'),
+        'bert': (BertTokenizer, BertModel, 'bert-base-uncased'),
+        'roberta': (RobertaTokenizer, RobertaModel, 'roberta-base'),
+        'distilbert': (DistilBertTokenizer, DistilBertModel, 'distilbert-base-uncased'),
+        'xlnet': (XLNetTokenizer, XLNetModel, 'xlnet-base-cased')
+    }
+    print("All models available")
+except ImportError as e:
+    # If some models are not available, print an error message
+    print(f"Some models might not be available: {e}")
+    MODELS = {}
+
+# Class for saving embeddings
+class EmbeddingSaver:
+    def __init__(self, base_path):
+        # Initialize the saver with a base path for saving embeddings
+        self.base_path = base_path
+        # Create directories for saving embeddings
+        os.makedirs(base_path, exist_ok=True)
+        self.gene_based_path = os.path.join(base_path, 'gene_based')
+        self.baseline_path = os.path.join(base_path, 'baseline')
+        os.makedirs(self.gene_based_path, exist_ok=True)
+        os.makedirs(self.baseline_path, exist_ok=True)
+        # Initialize metadata dictionary
+        self.metadata = {'gene_based': {}, 'baseline': {}}
+
+    def save_gene_based(self, embeddings, part):
+        # Save gene-based embeddings
+        file_path = os.path.join(self.gene_based_path, f'gene_based_embeddings_part{part}.pt')
+        torch.save(embeddings, file_path)
+        self.metadata['gene_based'][f'part{part}'] = file_path
+
+    def save_baseline(self, embeddings, file_name):
+        # Save baseline embeddings
+        file_path = os.path.join(self.baseline_path, f'{file_name}_embeddings.pt')
+        torch.save(embeddings, file_path)
+        self.metadata['baseline'][file_name] = file_path
+
+    def save_metadata(self):
+        # Save metadata to a JSON file
+        with open(os.path.join(self.base_path, 'embedding_metadata.json'), 'w') as f:
+            json.dump(self.metadata, f, indent=2)
+
+# Custom Dataset class for processing literature in chunks
 class ChunkedLiteratureDataset(Dataset):
-    def __init__(self, file_path, tokenizer, max_length=512, chunk_size=1000):
+    def __init__(self, file_path, tokenizer, max_length=512, chunk_size=100000):
         self.file_path = file_path
         self.tokenizer = tokenizer
         self.max_length = max_length
@@ -23,6 +71,7 @@ def __init__(self, file_path, tokenizer, max_length=512, chunk_size=1000):
         self.file = open(self.file_path, 'r', encoding='utf-8')
 
     def load_chunk(self):
+        # Load a chunk of data from the file
        self.current_chunk = []
        for _ in range(self.chunk_size):
            line = self.file.readline()
@@ -32,9 +81,11 @@ def load_chunk(self):
         return len(self.current_chunk) > 0
 
     def __len__(self):
-        return self.chunk_size
+        # Return the length of the current chunk
+        return len(self.current_chunk)
 
     def __getitem__(self, idx):
+        # Get and tokenize an item from the current chunk
         if idx >= len(self.current_chunk):
             raise IndexError("Index out of bounds")
         text = self.current_chunk[idx]
@@ -51,29 +102,37 @@ def __getitem__(self, idx):
             'attention_mask': encoding['attention_mask'].flatten()
         }
 
 
-def process_in_batches(file_path, model, tokenizer, batch_size=32, max_length=512, chunk_size=1000):
+# Function to process data in batches
+def process_in_batches(file_path, model, tokenizer, embedding_saver, is_gene_based=True, batch_size=300, max_length=512, chunk_size=100000):
+    # Create a dataset
     dataset = ChunkedLiteratureDataset(file_path, tokenizer, max_length, chunk_size)
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
     all_embeddings = []
     chunk_count = 0
 
-    model.eval()
+    model.eval()  # Set model to evaluation mode
 
-    with torch.no_grad():
-        while dataset.load_chunk():
+    with torch.no_grad():  # Disable gradient calculation
+        while dataset.load_chunk():  # Process data chunk by chunk
+            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
             chunk_embeddings = []
             for batch in tqdm(dataloader, desc=f"Processing chunk {chunk_count}"):
+                # Move batch to device and get model outputs
                 input_ids = batch['input_ids'].to(model.device)
                 attention_mask = batch['attention_mask'].to(model.device)
                 outputs = model(input_ids, attention_mask=attention_mask)
                 chunk_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())
 
+            # Concatenate embeddings for the current chunk
             chunk_embeddings = torch.cat(chunk_embeddings)
             all_embeddings.append(chunk_embeddings)
 
             # Save embeddings periodically
             if len(all_embeddings) % 10 == 0:
-                torch.save(torch.cat(all_embeddings), f'gene_based_embeddings_part{chunk_count//10}.pt')
+                embeddings_to_save = torch.cat(all_embeddings)
+                if is_gene_based:
+                    embedding_saver.save_gene_based(embeddings_to_save, chunk_count//10)
+                else:
+                    embedding_saver.save_baseline(embeddings_to_save, os.path.basename(file_path))
                 all_embeddings = []  # Clear the list to free up memory
                 gc.collect()  # Force garbage collection
@@ -81,37 +140,50 @@ def process_in_batches(file_path, model, tokenizer, batch_size=32, max_length=51
 
     # Save any remaining embeddings
     if all_embeddings:
-        torch.save(torch.cat(all_embeddings), f'gene_based_embeddings_part{chunk_count//10}.pt')
+        embeddings_to_save = torch.cat(all_embeddings)
+        if is_gene_based:
+            embedding_saver.save_gene_based(embeddings_to_save, chunk_count//10)
+        else:
+            embedding_saver.save_baseline(embeddings_to_save, os.path.basename(file_path))
 
     print(f"Processed {chunk_count} chunks from {file_path}")
 
 # Set up paths
-gene_based_path = r"D:\ZIP\arranged\gene_based_consolidated.txt"
+gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\arranged\consolidated_gene_docs.txt"
 baseline_doc_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed"
+output_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\embeddings"
+
+# Choose model
+model_choice = 'albert'  # Can be changed to 'bert', 'roberta', 'distilbert', or 'xlnet'
+if model_choice not in MODELS:
+    raise ValueError(f"Model {model_choice} not available. Choose from: {list(MODELS.keys())}")
+
+# Get the appropriate classes and model name for the chosen model
+TokenizerClass, ModelClass, model_name = MODELS[model_choice]
+print(f"Using {model_choice.upper()} model")
 
 # Initialize model and tokenizer
-tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME) if MODEL_NAME.startswith('albert') else BertTokenizer.from_pretrained(MODEL_NAME)
-model = AlbertModel.from_pretrained(MODEL_NAME) if MODEL_NAME.startswith('albert') else BertModel.from_pretrained(MODEL_NAME)
+tokenizer = TokenizerClass.from_pretrained(model_name)
+model = ModelClass.from_pretrained(model_name)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model.to(device)
+model.to(device)  # Move model to GPU if available
+
+# Initialize embedding saver
+embedding_saver = EmbeddingSaver(output_dir)
 
 # Process gene-based data
 print("Processing gene-based data...")
-process_in_batches(gene_based_path, model, tokenizer)
+process_in_batches(gene_based_path, model, tokenizer, embedding_saver, is_gene_based=True)
 
-# Process baseline documents (unchanged)
+# Process baseline documents
 print("Processing baseline documents...")
-baseline_embeddings = []
 for file in os.listdir(baseline_doc_dir):
     if file.endswith('.txt'):
         file_path = os.path.join(baseline_doc_dir, file)
-        embeddings = process_in_batches(file_path, model, tokenizer)
-        baseline_embeddings.append(embeddings)
-
-# Combine baseline embeddings
-baseline_embeddings = torch.cat(baseline_embeddings)
+        print(f"Processing {file}...")
+        process_in_batches(file_path, model, tokenizer, embedding_saver, is_gene_based=False, chunk_size=100000)
 
-# Save baseline embeddings
-torch.save(baseline_embeddings, 'baseline_embeddings.pt')
+# Save metadata
+embedding_saver.save_metadata()
 
 print("Embedding process completed.")
\ No newline at end of file
diff --git a/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt b/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt
new file mode 100644
index 0000000..9b9fb7e
Binary files /dev/null and b/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt differ
diff --git a/output/proprocessed_example/gene_based_embeddings_part0.pt b/output/proprocessed_example/gene_based_embeddings_part0.pt
new file mode 100644
index 0000000..49c8fac
Binary files /dev/null and b/output/proprocessed_example/gene_based_embeddings_part0.pt differ
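For downstream use (e.g. clustering or similarity search), the per-part .pt files and the embedding_metadata.json index written by EmbeddingSaver can be loaded back into a single matrix. A minimal sketch, assuming the output layout produced by the step3 script in this patch; the load_gene_based_embeddings helper name and the metadata path are illustrative, not part of this diff.

import json
import torch

def load_gene_based_embeddings(metadata_path):
    # Read the index written by EmbeddingSaver.save_metadata()
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    # Sort parts numerically ('part0', 'part1', ..., 'part10') so rows keep processing order
    parts = sorted(metadata['gene_based'].items(),
                   key=lambda kv: int(kv[0].replace('part', '')))
    # Each part holds the stacked CLS embeddings for one group of chunks
    tensors = [torch.load(path, map_location='cpu') for _, path in parts]
    return torch.cat(tensors, dim=0)

# Hypothetical usage; the path mirrors the output_dir used in step3
embeddings = load_gene_based_embeddings('output/embeddings/embedding_metadata.json')
print(embeddings.shape)  # (num_documents, hidden_size), e.g. 768 for albert-base-v2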