diff --git a/code/step2_data_preprocessing.py b/code/step2_data_preprocessing.py index 9538008..d20b5d7 100644 --- a/code/step2_data_preprocessing.py +++ b/code/step2_data_preprocessing.py @@ -1,63 +1,186 @@ -# -*- coding: utf-8 -*- -""" -Created on Sun Jun 21 00:16:25 2020 - -@author: Jihye Moon - -""" - import os import pathlib import sys - +import unicodedata +import traceback +import json +import re sys.path.append('lib') import lib.Literature_Data_Preprocessing as ldp -base = sys.argv[1] -output = sys.argv[2] -batch_dir = base # os.path.join(base, 'literature_data') -comb_dir = os.path.join(base, 'arranged') -preprocessed_dir = os.path.join(output, 'preprocessed') -pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True) -pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True) - -lp=ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir) - -### Extracting only abstracts and combining all collected files into one file (Gene name based documents) -file_names, data_list=lp.batch_data_matching(batch_dir, ['gene2document']) -arr_list = lp.combining_files(file_names, data_list, ['FullText'], 3) - -for i in range(len(file_names)): - lp.Indexing(os.path.join(comb_dir, file_names[i]), arr_list[file_names[i]]) - -gene2doc = lp.gene2doc_mapping(arr_list[file_names[0]]) - - -### Extracting only abstracts and combining all collected files into one file (Word name based documents) -file_names_doc, data_list_doc = lp.batch_data_matching(batch_dir, ['baseline_doc']) -arr_list2 = lp.combining_query2doc(file_names_doc, data_list_doc, ['pubmed'], 4) - - -### Literature Data Preprocessing -total_FullText = ''; total_meta = '' -total_size=len(arr_list2[file_names_doc[0]]) -full_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.FullText.txt'), "w") -meta_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.meta.txt'), "w") - -total_FullText=[] -for i in range(total_size): - FullText, Meta = lp.Medine_mapping(arr_list2[file_names_doc[0]][i]) - #print(i, '/', total_size, round(i/total_size,2)*100) - total_FullText.append(FullText) - full_handle.write(FullText) - meta_handle.write(Meta) -full_handle.close() -meta_handle.close() - -doc_gene=list(gene2doc.keys()) - -print('----- preprocessing --- for gene name based documents') -lp.making_doc_data(doc_gene, file_names[0], gene2doc) - -print('----- preprocessing --- for word name based documents') -lp.making_doc_data(None, file_names_doc[0], total_FullText) +def check_directory(path): + """Check if a directory exists.""" + if not os.path.exists(path): + print(f"Directory does not exist: {path}") + return False + return True + +def safe_read(file_path): + """Safely read a file with different encodings.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except UnicodeDecodeError: + try: + with open(file_path, 'r', encoding='latin-1') as f: + return f.read() + except Exception as e: + print(f"Error reading file {file_path}: {e}") + return None + except Exception as e: + print(f"Unexpected error reading file {file_path}: {e}") + return None + +def process_gene_docs_in_chunks(file_path, chunk_size=10000): + """Process gene documents in chunks.""" + gene2doc = {} + with open(file_path, 'r', encoding='utf-8') as f: + current_gene = None + current_doc = [] + for line in f: + if line.startswith("#GENENAME-"): + if current_gene and current_doc: + gene2doc[current_gene] = ''.join(current_doc) + if len(gene2doc) % chunk_size == 0: + yield gene2doc + gene2doc = {} + current_gene = line.split("-", 1)[1].strip() + current_doc = [] + else: + current_doc.append(line) + + if current_gene and current_doc: + gene2doc[current_gene] = ''.join(current_doc) + + if gene2doc: + yield gene2doc + +class CustomPreprocessing(ldp.preprocessing): + def making_doc_data(self, gene_list, name, dic, mode='w'): + """Create preprocessed document data.""" + preprocessed_dir = self.preprocessed_dir + counting = 0 + handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), mode, encoding='utf-8') + if gene_list is None: + for i in range(len(dic)): + if counting == 10000: + print(i, '/', len(dic)) + counting = 0 + buffer = dic[i].split('\t') + if buffer[0] != '\n': + buffer = buffer[3] + buffer[4] + if buffer != '': + buffer = self.doc_preprocessor(buffer) + handle.write('-1' + '\t' + buffer + '\n') + counting += 1 + else: + for i in range(len(gene_list)): + if counting == 10000: + print(i, '/', len(gene_list)) + counting = 0 + gene_name = gene_list[i] + data = dic[gene_name] + buffer = self.doc_preprocessor(data) + if buffer != '': + # Extract PMID from the buffer + pmid_match = re.search(r'#PMID-\s*(\d+)', buffer) + if pmid_match: + pmid = pmid_match.group(1) + # Add gene name before PMID + modified_buffer = re.sub(r'(#PMID-\s*\d+)', f'#GENENAME- {gene_name} \\1', buffer) + handle.write('#'+ gene_name + '\t' + modified_buffer + '\n') + else: + # If PMID is not found, just prepend the gene name + handle.write('#'+ gene_name + '\t#GENENAME- ' + gene_name + ' ' + buffer + '\n') + counting += 1 + handle.close() + +def main(): + base = sys.argv[1] + output = sys.argv[2] + + # Update paths to match your new structure + batch_dir = base + gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records') + baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc') + comb_dir = os.path.join(output, 'arranged') + preprocessed_dir = os.path.join(output, 'gene_based_preprocessed') + + print(f"Checking directories...") + print(f"batch_dir: {batch_dir}") + print(f"gene_based_dir: {gene_based_dir}") + print(f"baseline_doc_dir: {baseline_doc_dir}") + print(f"comb_dir: {comb_dir}") + print(f"preprocessed_dir: {preprocessed_dir}") + + if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])): + sys.exit("One or more required directories do not exist. Please check the paths and try again.") + + pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True) + pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True) + + lp = CustomPreprocessing(base, batch_dir, comb_dir, preprocessed_dir) + + # Process gene-based documents + consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt') + + if not os.path.exists(consolidated_file): + print("Consolidated file not found. Starting consolidation process...") + gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')] + print(f"Found {len(gene_files)} gene-based documents.") + + if not gene_files: + print("No gene-based documents found. Skipping this step.") + else: + with open(consolidated_file, 'w', encoding='utf-8') as outfile: + for i, file in enumerate(gene_files, 1): + try: + gene_name = os.path.splitext(file)[0] # Get filename without extension + content = safe_read(os.path.join(gene_based_dir, file)) + if content is not None: + content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii') + outfile.write(f"#GENENAME- {gene_name}\n{content}\n\n") + else: + print(f"Skipping file {file} due to reading error.") + except Exception as e: + print(f"Error processing file {file}: {e}") + print(traceback.format_exc()) + if i % 1000 == 0: + print(f"Consolidating file {i}/{len(gene_files)}") + print("All gene-based documents consolidated.") + else: + print("Consolidated file found. Skipping consolidation process.") + + print("Processing consolidated gene-based document...") + processed_genes_file = os.path.join(preprocessed_dir, 'processed_genes.json') + processed_genes = set() + + if os.path.exists(processed_genes_file): + with open(processed_genes_file, 'r') as f: + processed_genes = set(json.load(f)) + print(f"Resuming from {len(processed_genes)} previously processed genes.") + + output_file = os.path.join(preprocessed_dir, 'consolidated_gene_docs.data.doc.txt') + mode = 'a' if os.path.exists(output_file) else 'w' + + for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file): + print(f"Processing chunk with {len(gene2doc_chunk)} genes...") + new_genes = set(gene2doc_chunk.keys()) - processed_genes + if new_genes: + doc_gene = list(new_genes) + lp.making_doc_data(doc_gene, 'consolidated_gene_docs', {g: gene2doc_chunk[g] for g in new_genes}, mode) + processed_genes.update(new_genes) + + # Update the processed genes file + with open(processed_genes_file, 'w') as f: + json.dump(list(processed_genes), f) + else: + print("All genes in this chunk have been processed already. Moving to next chunk.") + + # Change mode to 'a' after first write + mode = 'a' + + print("All processing completed.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/step2_data_preprocessing_Luis_Custom.py b/code/step2_data_preprocessing_Luis_Custom_document_based.py similarity index 100% rename from code/step2_data_preprocessing_Luis_Custom.py rename to code/step2_data_preprocessing_Luis_Custom_document_based.py diff --git a/code/step2_data_preprocessing_Luis_final.py b/code/step2_data_preprocessing_Luis_final_nonfinal.py similarity index 100% rename from code/step2_data_preprocessing_Luis_final.py rename to code/step2_data_preprocessing_Luis_final_nonfinal.py diff --git a/code/step2_data_preprocessing_Luis_genebased_new.py b/code/step2_data_preprocessing_Luis_genebased_new.py new file mode 100644 index 0000000..27d97f7 --- /dev/null +++ b/code/step2_data_preprocessing_Luis_genebased_new.py @@ -0,0 +1,121 @@ +import os +import pathlib +import sys +import unicodedata +import json +import re +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed +import multiprocessing +sys.path.append('lib') +import lib.Literature_Data_Preprocessing as ldp + +def check_directory(path): + if not os.path.exists(path): + print(f"Directory does not exist: {path}") + return False + return True + +def safe_read(file_path): + encodings = ['utf-8', 'latin-1'] + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except Exception as e: + print(f"Unexpected error reading file {file_path}: {e}") + print(f"Error reading file {file_path}: Unable to decode with available encodings") + return None + +def load_gene_info(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + return {line.strip(): line.strip() for line in f} + +class CustomPreprocessing(ldp.preprocessing): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pmid_pattern = re.compile(r'#PMID-\s*(\d+)') + + def process_gene_file(self, file_path, gene_symbol, gene_name): + content = safe_read(file_path) + if content is not None: + content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii') + buffer = self.doc_preprocessor(content) + if buffer: + pmid_match = self.pmid_pattern.search(buffer) + if pmid_match: + modified_buffer = re.sub(r'(#PMID-\s*\d+)', f'#GENESYMBOL- {gene_symbol} #GENENAME- {gene_name} \\1', buffer) + return f'#{gene_symbol}\t{modified_buffer}\n' + else: + return f'#{gene_symbol}\t#GENESYMBOL- {gene_symbol} #GENENAME- {gene_name} {buffer}\n' + return None + +def process_gene(args): + lp, file_path, gene_symbol, gene_name = args + return lp.process_gene_file(file_path, gene_symbol, gene_name) + +def main(base, output): + batch_dir = base + gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records') + gene_info_dir = os.path.join(batch_dir, 'data', 'gene_name_info') + preprocessed_dir = os.path.join(output, 'gene_based_preprocessed') + + print("Checking directories...") + for dir_path in [batch_dir, gene_based_dir, gene_info_dir]: + if not check_directory(dir_path): + sys.exit(f"Directory does not exist: {dir_path}") + + pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True) + + lp = CustomPreprocessing(base, batch_dir, None, preprocessed_dir) + + gene_symbols = load_gene_info(os.path.join(gene_info_dir, 'query_symbol.txt')) + gene_names = load_gene_info(os.path.join(gene_info_dir, 'query_full_name.txt')) + + output_file = os.path.join(preprocessed_dir, 'consolidated_gene_docs.data.doc.txt') + processed_genes_file = os.path.join(preprocessed_dir, 'processed_genes.json') + + processed_genes = set() + if os.path.exists(processed_genes_file): + with open(processed_genes_file, 'r') as f: + processed_genes = set(json.load(f)) + print(f"Resuming from {len(processed_genes)} previously processed genes.") + + gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')] + total_files = len(gene_files) + print(f"Found {total_files} gene-based documents.") + + num_processes = multiprocessing.cpu_count() + + with ProcessPoolExecutor(max_workers=num_processes) as executor: + futures = [] + for file in gene_files: + gene_symbol = os.path.splitext(file)[0] + if gene_symbol not in processed_genes: + gene_name = gene_names.get(gene_symbol, gene_symbol) + file_path = os.path.join(gene_based_dir, file) + futures.append(executor.submit(process_gene, (lp, file_path, gene_symbol, gene_name))) + + with open(output_file, 'a', encoding='utf-8') as outfile: + for future in tqdm(as_completed(futures), total=len(futures), desc="Processing genes"): + result = future.result() + if result: + outfile.write(result) + gene_symbol = result.split('\t')[0][1:] # Extract gene symbol from the result + processed_genes.add(gene_symbol) + + # Save progress every 1000 genes + if len(processed_genes) % 1000 == 0: + with open(processed_genes_file, 'w') as pf: + json.dump(list(processed_genes), pf) + + # Final save of processed genes + with open(processed_genes_file, 'w') as pf: + json.dump(list(processed_genes), pf) + + print("All processing completed.") + +if __name__ == "__main__": + main(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/code/step2_data_preprocessing_Luis_new.py b/code/step2_data_preprocessing_Luis_new_nonfinal.py similarity index 76% rename from code/step2_data_preprocessing_Luis_new.py rename to code/step2_data_preprocessing_Luis_new_nonfinal.py index 2250b4d..6af53c5 100644 --- a/code/step2_data_preprocessing_Luis_new.py +++ b/code/step2_data_preprocessing_Luis_new_nonfinal.py @@ -4,6 +4,8 @@ import unicodedata import traceback import json +import re + sys.path.append('lib') import lib.Literature_Data_Preprocessing as ldp @@ -55,7 +57,32 @@ def process_gene_docs_in_chunks(file_path, chunk_size=10000): if gene2doc: yield gene2doc +def load_gene_info(gene_info_dir): + gene_names_file = os.path.join(gene_info_dir, 'query_full_name.txt') + gene_symbols_file = os.path.join(gene_info_dir, 'query_symbol.txt') + + gene_names = {} + gene_symbols = {} + + with open(gene_names_file, 'r') as f: + for line in f: + parts = line.strip().split('\t') + if len(parts) == 2: + gene_names[parts[0]] = parts[1] + + with open(gene_symbols_file, 'r') as f: + for line in f: + parts = line.strip().split('\t') + if len(parts) == 2: + gene_symbols[parts[0]] = parts[1] + + return gene_names, gene_symbols + class CustomPreprocessing(ldp.preprocessing): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.gene_names, self.gene_symbols = load_gene_info(os.path.join(args[0], 'data', 'gene_name_info')) + def making_doc_data(self, gene_list, name, dic, mode='w'): preprocessed_dir = self.preprocessed_dir counting = 0 @@ -78,13 +105,30 @@ def making_doc_data(self, gene_list, name, dic, mode='w'): if counting == 10000: print(i, '/', len(gene_list)) counting = 0 - data = dic[gene_list[i]] + gene_name = gene_list[i] + data = dic[gene_name] buffer = self.doc_preprocessor(data) if buffer != '': - handle.write('#'+ gene_list[i] + '\t' + buffer + '\n') + gene_symbol = self.gene_symbols.get(gene_name, "Unknown") + gene_full_name = self.gene_names.get(gene_name, "Unknown") + + # Add gene name, symbol, and ID at the beginning + gene_info = f'#GENENAME- {gene_name} #GENESYMBOL- {gene_symbol} #GENEID- {gene_full_name} ' + + # Extract PMID from the buffer and move it after gene info + pmid_match = re.search(r'#PMID-\s*(\d+)', buffer) + if pmid_match: + pmid = pmid_match.group(0) + buffer = re.sub(r'#PMID-\s*\d+', '', buffer) # Remove PMID from its original position + modified_buffer = gene_info + pmid + ' ' + buffer.strip() + else: + modified_buffer = gene_info + buffer + + handle.write('#' + gene_name + '\t' + modified_buffer + '\n') counting += 1 handle.close() +# The rest of the script remains the same base = sys.argv[1] output = sys.argv[2] diff --git a/code/step3_literature_embedding_LDA_Luis.py b/code/step3_literature_embedding_LDA_Luis.py new file mode 100644 index 0000000..ba15f60 --- /dev/null +++ b/code/step3_literature_embedding_LDA_Luis.py @@ -0,0 +1,150 @@ +import os +import nltk +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +from gensim import corpora +from gensim.models import LdaMulticore, CoherenceModel +from gensim.parsing.preprocessing import STOPWORDS +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from wordcloud import WordCloud +import pyLDAvis +import pyLDAvis.gensim_models as gensimvis + +# Download necessary NLTK data +nltk.download('punkt') +nltk.download('stopwords') + +# Function to preprocess text +def preprocess(text): + tokens = word_tokenize(text.lower()) + stop_words = set(stopwords.words('english')).union(STOPWORDS) + tokens = [token for token in tokens if token not in stop_words and len(token) > 3] + return tokens + +# Function to read and preprocess documents +def read_and_preprocess_docs(directory, gene_file=None): + documents = [] + doc_names = [] + + for filename in os.listdir(directory): + if filename.endswith('.txt'): + with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file: + text = file.read() + documents.append(preprocess(text)) + doc_names.append(filename) + + if gene_file: + with open(gene_file, 'r', encoding='utf-8') as file: + gene_docs = file.read().split('#GENENAME-') + for gene_doc in gene_docs[1:]: + gene_name = gene_doc.split('\n', 1)[0].strip() + gene_text = gene_doc.split('\n', 1)[1] + documents.append(preprocess(gene_text)) + doc_names.append(f"Gene_{gene_name}") + + return documents, doc_names + +# Set the directory paths +baseline_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed" +gene_file = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt" + +# Read and preprocess documents +docs, doc_names = read_and_preprocess_docs(baseline_dir, gene_file) + +# Create dictionary and corpus +dictionary = corpora.Dictionary(docs) +corpus = [dictionary.doc2bow(doc) for doc in docs] + +# Function to compute coherence values +def compute_coherence_values(dictionary, corpus, texts, start, limit, step): + coherence_values = [] + model_list = [] + for num_topics in range(start, limit, step): + model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100) + model_list.append(model) + coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v') + coherence_values.append(coherencemodel.get_coherence()) + return model_list, coherence_values + +# Compute coherence values +model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=5, limit=50, step=5) + +# Find the optimal number of topics +optimal_num_topics = coherence_values.index(max(coherence_values)) * 5 + 5 + +# Train LDA model with optimal number of topics +lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=optimal_num_topics, + random_state=100, chunksize=100, passes=10, per_word_topics=True) + +# Function to format topic distribution +def format_topics_sentences(ldamodel, corpus, doc_names): + sent_topics_df = pd.DataFrame() + for i, row in enumerate(ldamodel[corpus]): + row = sorted(row[0], key=lambda x: (x[1]), reverse=True) + for j, (topic_num, prop_topic) in enumerate(row): + if j == 0: + wp = ldamodel.show_topic(topic_num) + topic_keywords = ", ".join([word for word, prop in wp]) + sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), + topic_keywords, doc_names[i]]), + ignore_index=True) + else: + break + sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Document'] + return sent_topics_df + +# Format results +df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, doc_names=doc_names) + +# Save results to CSV +df_topic_sents_keywords.to_csv('lda_results_combined.csv', index=False) + +# Analyze ZIP11 and Zinc Homeostasis related topics +zip11_zinc_topics = df_topic_sents_keywords[ + df_topic_sents_keywords['Topic_Keywords'].str.contains('zip11|zinc|homeostasis', case=False) +] +print("\nTopics related to ZIP11 or Zinc Homeostasis:") +print(zip11_zinc_topics[['Dominant_Topic', 'Topic_Keywords']]) + +# Visualization 1: Topic Distribution +plt.figure(figsize=(12, 6)) +topic_dist = df_topic_sents_keywords['Dominant_Topic'].value_counts().sort_index() +sns.barplot(x=topic_dist.index, y=topic_dist.values) +plt.title('Distribution of Dominant Topics') +plt.xlabel('Topic Number') +plt.ylabel('Number of Documents') +plt.savefig('topic_distribution.png') +plt.close() + +# Visualization 2: Word Cloud of ZIP11 and Zinc Homeostasis related topics +zip11_zinc_text = ' '.join(zip11_zinc_topics['Topic_Keywords']) +wordcloud = WordCloud(width=800, height=400, background_color='white').generate(zip11_zinc_text) +plt.figure(figsize=(10, 5)) +plt.imshow(wordcloud, interpolation='bilinear') +plt.axis('off') +plt.title('Word Cloud: ZIP11 and Zinc Homeostasis Topics') +plt.savefig('zip11_zinc_wordcloud.png') +plt.close() + +# Visualization 3: pyLDAvis +vis_data = gensimvis.prepare(lda_model, corpus, dictionary) +pyLDAvis.save_html(vis_data, 'lda_visualization.html') + +print("Analysis complete. Results saved to CSV and visualizations saved as PNG files.") +print("Interactive LDA visualization saved as 'lda_visualization.html'") + +# Additional analysis: Top words for ZIP11 and Zinc Homeostasis topics +print("\nTop words for ZIP11 and Zinc Homeostasis related topics:") +for topic in zip11_zinc_topics['Dominant_Topic'].unique(): + print(f"Topic {topic}:") + print(lda_model.show_topic(topic)) + print() + +# Document-Topic Distribution for ZIP11 and Zinc Homeostasis documents +zip11_zinc_docs = df_topic_sents_keywords[ + df_topic_sents_keywords['Document'].str.contains('zip11|zinc|homeostasis', case=False) +] +print("\nDocument-Topic Distribution for ZIP11 and Zinc Homeostasis documents:") +print(zip11_zinc_docs[['Document', 'Dominant_Topic', 'Perc_Contribution']]) \ No newline at end of file diff --git a/code/step3_literature_embedding_training_Luis.py b/code/step3_literature_embedding_training_Luis.py index c8fcca3..972fb7e 100644 --- a/code/step3_literature_embedding_training_Luis.py +++ b/code/step3_literature_embedding_training_Luis.py @@ -149,7 +149,7 @@ def process_in_batches(file_path, model, tokenizer, embedding_saver, is_gene_bas print(f"Processed {chunk_count} chunks from {file_path}") # Set up paths -gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\arranged\consolidated_gene_docs.txt" +gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt" baseline_doc_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed" output_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\embeddings" diff --git a/code/step3_literature_embedding_training_Luis_new.py b/code/step3_literature_embedding_training_Luis_new.py new file mode 100644 index 0000000..11a17f3 --- /dev/null +++ b/code/step3_literature_embedding_training_Luis_new.py @@ -0,0 +1,332 @@ +import os +import torch +from torch.utils.data import IterableDataset +from tqdm import tqdm +import gc +import json +import numpy as np +from collections import Counter +from torch import nn +import torch.optim as optim +import random +import itertools +import pickle + +# Try to import various transformer models and tokenizers +try: + from transformers import ( + AlbertTokenizer, AlbertModel, + BertTokenizer, BertModel, + RobertaTokenizer, RobertaModel, + DistilBertTokenizer, DistilBertModel, + XLNetTokenizer, XLNetModel + ) + # Define a dictionary of available models with their respective tokenizers and model classes + MODELS = { + 'albert': (AlbertTokenizer, AlbertModel, 'albert-base-v2'), + 'bert': (BertTokenizer, BertModel, 'bert-base-uncased'), + 'roberta': (RobertaTokenizer, RobertaModel, 'roberta-base'), + 'distilbert': (DistilBertTokenizer, DistilBertModel, 'distilbert-base-uncased'), + 'xlnet': (XLNetTokenizer, XLNetModel, 'xlnet-base-cased') + } + print("All models available") +except ImportError as e: + # If some models are not available, print an error message + print(f"Some models might not be available: {e}") + MODELS = {} + +class Vocabulary: + def __init__(self, min_count=5, min_size=2): + self.word2idx = {} + self.idx2word = [] + self.word_counts = Counter() + self.min_count = min_count + self.min_size = min_size + + def add_document(self, doc): + for word in doc.split(): + if len(word) >= self.min_size: + self.word_counts[word] += 1 + + def build_vocab(self): + for word, count in self.word_counts.items(): + if count >= self.min_count: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + + def save_vocab(self, path): + with open(os.path.join(path, 'vocabulary.json'), 'w') as f: + json.dump(self.word2idx, f) + with open(os.path.join(path, 'reverse_vocabulary.json'), 'w') as f: + json.dump(self.idx2word, f) + +class Gene2DocDataset(IterableDataset): + def __init__(self, gene2doc_file, vocabulary, window_size=2, chunk_size=1000): + self.gene2doc_file = gene2doc_file + self.vocabulary = vocabulary + self.window_size = window_size + self.chunk_size = chunk_size + + def __iter__(self): + with open(self.gene2doc_file, 'r', encoding='utf-8') as f: + chunk = [] + for line in f: + parts = line.strip().split('\t') + if len(parts) == 2: + gene, doc = parts + words = [self.vocabulary.word2idx[w] for w in doc.split() if w in self.vocabulary.word2idx] + chunk.append(words) + + if len(chunk) >= self.chunk_size: + yield from self.process_chunk(chunk) + chunk = [] + + if chunk: # Process any remaining data + yield from self.process_chunk(chunk) + + def process_chunk(self, chunk): + for words in chunk: + for i in range(len(words)): + for j in range(max(0, i - self.window_size), min(len(words), i + self.window_size + 1)): + if i != j: + yield torch.tensor(words[i]), torch.tensor(words[j]) + +class Word2DocDataset(IterableDataset): + def __init__(self, baseline_doc_dir, vocabulary, window_size=2, chunk_size=1000): + self.baseline_doc_dir = baseline_doc_dir + self.vocabulary = vocabulary + self.window_size = window_size + self.chunk_size = chunk_size + + def __iter__(self): + for filename in os.listdir(self.baseline_doc_dir): + if filename.endswith('.data.doc.txt'): + file_path = os.path.join(self.baseline_doc_dir, filename) + with open(file_path, 'r', encoding='utf-8') as f: + chunk = [] + for line in f: + words = [self.vocabulary.word2idx[w] for w in line.strip().split() if w in self.vocabulary.word2idx] + chunk.append(words) + + if len(chunk) >= self.chunk_size: + yield from self.process_chunk(chunk) + chunk = [] + + if chunk: # Process any remaining data + yield from self.process_chunk(chunk) + + def process_chunk(self, chunk): + for words in chunk: + for i in range(len(words)): + for j in range(max(0, i - self.window_size), min(len(words), i + self.window_size + 1)): + if i != j: + yield torch.tensor(words[i]), torch.tensor(words[j]) + +class SkipGramModel(nn.Module): + def __init__(self, vocab_size, embedding_dim): + super(SkipGramModel, self).__init__() + self.embeddings = nn.Embedding(vocab_size, embedding_dim) + self.linear = nn.Linear(embedding_dim, vocab_size) + + def forward(self, inputs): + embeds = self.embeddings(inputs) + out = self.linear(embeds) + return out + +class EmbeddingSaver: + def __init__(self, base_path): + self.base_path = base_path + os.makedirs(base_path, exist_ok=True) + self.metadata = {} + + def save_embeddings(self, embeddings, vocabulary): + embedding_path = os.path.join(self.base_path, 'embeddings.npy') + np.save(embedding_path, embeddings) + + vocabulary.save_vocab(self.base_path) + + self.metadata['embeddings'] = embedding_path + self.metadata['vocabulary'] = os.path.join(self.base_path, 'vocabulary.json') + self.metadata['reverse_vocabulary'] = os.path.join(self.base_path, 'reverse_vocabulary.json') + + def save_metadata(self): + with open(os.path.join(self.base_path, 'embedding_metadata.json'), 'w') as f: + json.dump(self.metadata, f, indent=2) + +def save_vocabulary_checkpoint(vocab, output_dir): + checkpoint_path = os.path.join(output_dir, 'vocabulary_checkpoint.pkl') + with open(checkpoint_path, 'wb') as f: + pickle.dump(vocab, f) + print(f"Vocabulary checkpoint saved to {checkpoint_path}") + +def load_vocabulary_checkpoint(output_dir): + checkpoint_path = os.path.join(output_dir, 'vocabulary_checkpoint.pkl') + if os.path.exists(checkpoint_path): + with open(checkpoint_path, 'rb') as f: + vocab = pickle.load(f) + print(f"Vocabulary checkpoint loaded from {checkpoint_path}") + return vocab + return None + +def save_dataset_checkpoint(dataset, output_dir, dataset_name): + checkpoint_path = os.path.join(output_dir, f'{dataset_name}_checkpoint.pkl') + with open(checkpoint_path, 'wb') as f: + pickle.dump(dataset, f) + print(f"{dataset_name} checkpoint saved to {checkpoint_path}") + +def load_dataset_checkpoint(output_dir, dataset_name): + checkpoint_path = os.path.join(output_dir, f'{dataset_name}_checkpoint.pkl') + if os.path.exists(checkpoint_path): + with open(checkpoint_path, 'rb') as f: + dataset = pickle.load(f) + print(f"{dataset_name} checkpoint loaded from {checkpoint_path}") + return dataset + return None + +def save_checkpoint(model, optimizer, epoch, loss, path): + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'loss': loss, + }, path) + +def load_checkpoint(model, optimizer, path): + if os.path.exists(path): + checkpoint = torch.load(path) + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + epoch = checkpoint['epoch'] + loss = checkpoint['loss'] + print(f"Checkpoint loaded from {path}") + return model, optimizer, epoch, loss + return model, optimizer, 0, float('inf') + +def train_model(model, gene2doc_dataset, word2doc_dataset, device, output_dir, num_epochs=10, batch_size=128, save_interval=1): + optimizer = optim.Adam(model.parameters()) + criterion = nn.CrossEntropyLoss() + + # Load checkpoint if exists + checkpoint_path = os.path.join(output_dir, 'latest_checkpoint.pt') + model, optimizer, start_epoch, best_loss = load_checkpoint(model, optimizer, checkpoint_path) + + for epoch in range(start_epoch, num_epochs): + total_loss = 0 + batch_count = 0 + + combined_data = itertools.chain(gene2doc_dataset, word2doc_dataset) + + batch = [] + for item in combined_data: + batch.append(item) + if len(batch) == batch_size: + inputs, targets = zip(*batch) + inputs = torch.stack(inputs).to(device) + targets = torch.stack(targets).to(device) + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + total_loss += loss.item() + batch_count += 1 + + batch = [] + + if batch_count % 1000 == 0: + print(f"Epoch {epoch+1}, Batch {batch_count}, Loss: {total_loss/batch_count:.4f}") + + if batch: + inputs, targets = zip(*batch) + inputs = torch.stack(inputs).to(device) + targets = torch.stack(targets).to(device) + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + total_loss += loss.item() + batch_count += 1 + + avg_loss = total_loss / batch_count + print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}") + + # Save checkpoint + save_checkpoint(model, optimizer, epoch + 1, avg_loss, checkpoint_path) + print(f"Checkpoint saved to {checkpoint_path}") + + if avg_loss < best_loss: + best_loss = avg_loss + best_model_path = os.path.join(output_dir, 'best_model.pt') + torch.save(model.state_dict(), best_model_path) + print(f"Best model saved to {best_model_path}") + + return model + +# Main execution +if __name__ == "__main__": + # Set up paths + gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt" + baseline_doc_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed" + output_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\embeddings" + + # Create vocabulary + vocab = load_vocabulary_checkpoint(output_dir) + if vocab is None: + print("Creating vocabulary...") + vocab = Vocabulary(min_count=5, min_size=2) + with open(gene_based_path, 'r', encoding='utf-8') as f: + for line in f: + vocab.add_document(line.split('\t')[1]) + + for filename in os.listdir(baseline_doc_dir): + if filename.endswith('.data.doc.txt'): + file_path = os.path.join(baseline_doc_dir, filename) + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + vocab.add_document(line) + + vocab.build_vocab() + save_vocabulary_checkpoint(vocab, output_dir) + + # Save vocabulary + vocab.save_vocab(output_dir) + print(f"Vocabulary saved to {output_dir}") + + # Create datasets + print("Creating datasets...") + gene2doc_dataset = load_dataset_checkpoint(output_dir, 'gene2doc') + if gene2doc_dataset is None: + gene2doc_dataset = Gene2DocDataset(gene_based_path, vocab, chunk_size=500) + save_dataset_checkpoint(gene2doc_dataset, output_dir, 'gene2doc') + + word2doc_dataset = load_dataset_checkpoint(output_dir, 'word2doc') + if word2doc_dataset is None: + word2doc_dataset = Word2DocDataset(baseline_doc_dir, vocab, chunk_size=500) + save_dataset_checkpoint(word2doc_dataset, output_dir, 'word2doc') + + # Initialize model + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = SkipGramModel(len(vocab.idx2word), embedding_dim=128).to(device) + + # Train model + print("Training model...") + trained_model = train_model(model, gene2doc_dataset, word2doc_dataset, device, output_dir, save_interval=1) + + # Save final model and embeddings + print("Saving final model and embeddings...") + final_model_path = os.path.join(output_dir, 'final_model.pt') + torch.save(trained_model.state_dict(), final_model_path) + print(f"Final model saved to {final_model_path}") + + embedding_saver = EmbeddingSaver(output_dir) + embeddings = trained_model.embeddings.weight.cpu().detach().numpy() + embedding_saver.save_embeddings(embeddings, vocab) + embedding_saver.save_metadata() + print(f"Final embeddings and metadata saved to {output_dir}") + + print("Embedding process completed.") \ No newline at end of file diff --git a/code/step4_CVD_risk_factor_identification_Luis.py b/code/step4_CVD_risk_factor_identification_Luis.py new file mode 100644 index 0000000..448dbc6 --- /dev/null +++ b/code/step4_CVD_risk_factor_identification_Luis.py @@ -0,0 +1,104 @@ +import os +import torch +import numpy as np +import json +from sklearn.metrics.pairwise import cosine_similarity +import matplotlib.pyplot as plt +from sklearn.manifold import TSNE + +class EmbeddingModel: + def __init__(self, embedding_path, vocab_path, gene_symbol_path=None): + self.embeddings = np.load(embedding_path) + with open(vocab_path, 'r') as f: + self.word2idx = json.load(f) + self.idx2word = {v: k for k, v in self.word2idx.items()} + + self.gene_symbols = set() + if gene_symbol_path: + with open(gene_symbol_path, 'r') as f: + self.gene_symbols = set(line.strip() for line in f) + + def get_word_vector(self, word): + if word in self.word2idx: + return self.embeddings[self.word2idx[word]] + else: + raise ValueError(f"Word '{word}' not in vocabulary") + + def find_similar_words(self, query, n=25, include_similarity=True): + if isinstance(query, str): + query_vector = self.get_word_vector(query) + else: + query_vector = query + + similarities = cosine_similarity([query_vector], self.embeddings)[0] + top_n = similarities.argsort()[-n:][::-1] + + if include_similarity: + return [(self.idx2word[i], similarities[i]) for i in top_n] + else: + return [self.idx2word[i] for i in top_n] + + def similarity_display(self, query, output_path, top_n=25): + similar_words = self.find_similar_words(query, n=top_n) + + os.makedirs(output_path, exist_ok=True) + output_file = os.path.join(output_path, f"{query.replace(' ', '_')}_similar_words.txt") + + with open(output_file, 'w') as f: + f.write(f"Query: {query}\n") + f.write("Similar words:\n") + for word, similarity in similar_words: + f.write(f"{word}: {similarity:.4f}\n") + + print(f"Results saved to {output_file}") + + def visualize_words(self, words, filename=None): + word_vectors = [self.get_word_vector(word) for word in words] + tsne = TSNE(n_components=2, random_state=42) + reduced_vectors = tsne.fit_transform(word_vectors) + + plt.figure(figsize=(12, 8)) + for i, word in enumerate(words): + x, y = reduced_vectors[i] + plt.scatter(x, y) + plt.annotate(word, (x, y)) + + plt.title("Word Embeddings Visualization") + if filename: + plt.savefig(filename) + plt.show() + +class RunIntrinsicEvaluation: + def __init__(self): + self.model = None + + def setting(self, path, gene_symb): + embedding_path = os.path.join(path, 'embeddings.npy') + vocab_path = os.path.join(path, 'vocabulary.json') + self.model = EmbeddingModel(embedding_path, vocab_path, gene_symb) + + def running(self, query, output_path, top_n): + if self.model is None: + raise ValueError("Model not set. Call setting() first.") + self.model.similarity_display(query, output_path, top_n) + +# Usage example +if __name__ == "__main__": + import sys + + model_path = str(sys.argv[1]) + output_path = str(sys.argv[2]) + gene_symb_path = '../data/gene_name_info/query_symbol' + + queries = ['Zn Transport', 'protein names', 'drug interactions', 'cancer', 'drug names', 'protein drug', 'zinc', 'zn pathway'] + TOPNUM = 25 + + evaluator = RunIntrinsicEvaluation() + evaluator.setting(path=model_path, gene_symb=gene_symb_path) + + for query in queries: + evaluator.running(query, output_path, TOPNUM) + + # Visualization example + words_to_visualize = ["gene", "protein", "dna", "rna", "cell", "mutation"] + evaluator.model.visualize_words(words_to_visualize, os.path.join(output_path, "word_embeddings_visualization.png")) \ No newline at end of file diff --git a/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt b/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt deleted file mode 100644 index 9b9fb7e..0000000 Binary files a/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt and /dev/null differ diff --git a/output/proprocessed_example/gene_based_embeddings_part0.pt b/output/proprocessed_example/gene_based_embeddings_part0.pt deleted file mode 100644 index 49c8fac..0000000 Binary files a/output/proprocessed_example/gene_based_embeddings_part0.pt and /dev/null differ