diff --git a/code/step2_data_preprocessing.py b/code/step2_data_preprocessing.py
index 9538008..d20b5d7 100644
--- a/code/step2_data_preprocessing.py
+++ b/code/step2_data_preprocessing.py
@@ -1,63 +1,186 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Jun 21 00:16:25 2020
-
-@author: Jihye Moon
-
-"""
-
 import os
 import pathlib
 import sys
-
+import unicodedata
+import traceback
+import json
+import re
 sys.path.append('lib')  
 import lib.Literature_Data_Preprocessing as ldp
 
-base = sys.argv[1]
-output = sys.argv[2]
-batch_dir = base # os.path.join(base, 'literature_data')
-comb_dir = os.path.join(base, 'arranged')
-preprocessed_dir = os.path.join(output, 'preprocessed')
-pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
-pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)
- 
-lp=ldp.preprocessing(base, batch_dir, comb_dir, preprocessed_dir) 
-
-### Extracting only abstracts and combining all collected files into one file (Gene name based documents)
-file_names, data_list=lp.batch_data_matching(batch_dir, ['gene2document'])
-arr_list = lp.combining_files(file_names, data_list, ['FullText'], 3)
-
-for i in range(len(file_names)):
-    lp.Indexing(os.path.join(comb_dir, file_names[i]), arr_list[file_names[i]])
-    
-gene2doc = lp.gene2doc_mapping(arr_list[file_names[0]])
- 
-
-### Extracting only abstracts and combining all collected files into one file (Word name based documents)
-file_names_doc, data_list_doc = lp.batch_data_matching(batch_dir, ['baseline_doc'])
-arr_list2 = lp.combining_query2doc(file_names_doc, data_list_doc, ['pubmed'], 4) 
-
-
-### Literature Data Preprocessing
-total_FullText = ''; total_meta = ''
-total_size=len(arr_list2[file_names_doc[0]])
-full_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.FullText.txt'), "w")
-meta_handle = open(os.path.join(comb_dir, file_names_doc[0]+'.meta.txt'), "w")
-
-total_FullText=[]
-for i in range(total_size):
-    FullText, Meta = lp.Medine_mapping(arr_list2[file_names_doc[0]][i]) 
-    #print(i, '/', total_size, round(i/total_size,2)*100)
-    total_FullText.append(FullText)
-    full_handle.write(FullText)
-    meta_handle.write(Meta)
-full_handle.close()
-meta_handle.close()
-
-doc_gene=list(gene2doc.keys())
-
-print('----- preprocessing --- for gene name based documents')
-lp.making_doc_data(doc_gene, file_names[0], gene2doc) 
-
-print('----- preprocessing --- for word name based documents')
-lp.making_doc_data(None, file_names_doc[0], total_FullText)
+def check_directory(path):
+    """Check if a directory exists."""
+    if not os.path.exists(path):
+        print(f"Directory does not exist: {path}")
+        return False
+    return True
+
+def safe_read(file_path):
+    """Safely read a file with different encodings."""
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    except UnicodeDecodeError:
+        try:
+            with open(file_path, 'r', encoding='latin-1') as f:
+                return f.read()
+        except Exception as e:
+            print(f"Error reading file {file_path}: {e}")
+            return None
+    except Exception as e:
+        print(f"Unexpected error reading file {file_path}: {e}")
+        return None
+
+def process_gene_docs_in_chunks(file_path, chunk_size=10000):
+    """Process gene documents in chunks."""
+    gene2doc = {}
+    with open(file_path, 'r', encoding='utf-8') as f:
+        current_gene = None
+        current_doc = []
+        for line in f:
+            if line.startswith("#GENENAME-"):
+                if current_gene and current_doc:
+                    gene2doc[current_gene] = ''.join(current_doc)
+                    if len(gene2doc) % chunk_size == 0:
+                        yield gene2doc
+                        gene2doc = {}
+                current_gene = line.split("-", 1)[1].strip()
+                current_doc = []
+            else:
+                current_doc.append(line)
+        
+        if current_gene and current_doc:
+            gene2doc[current_gene] = ''.join(current_doc)
+        
+        if gene2doc:
+            yield gene2doc
+
+class CustomPreprocessing(ldp.preprocessing):
+    def making_doc_data(self, gene_list, name, dic, mode='w'):
+        """Create preprocessed document data."""
+        preprocessed_dir = self.preprocessed_dir
+        counting = 0
+        handle = open(os.path.join(preprocessed_dir, name+'.data.doc.txt'), mode, encoding='utf-8')
+        if gene_list is None:
+            for i in range(len(dic)): 
+                if counting == 10000:
+                    print(i, '/', len(dic))
+                    counting = 0
+                buffer = dic[i].split('\t')
+                if buffer[0] != '\n':
+                    buffer = buffer[3] + buffer[4]
+                    if buffer != '':
+                        buffer = self.doc_preprocessor(buffer) 
+                        handle.write('-1' + '\t' + buffer + '\n')
+                counting += 1
+        else:
+            for i in range(len(gene_list)): 
+                if counting == 10000:
+                    print(i, '/', len(gene_list))
+                    counting = 0
+                gene_name = gene_list[i]
+                data = dic[gene_name] 
+                buffer = self.doc_preprocessor(data)
+                if buffer != '':
+                    # Extract PMID from the buffer
+                    pmid_match = re.search(r'#PMID-\s*(\d+)', buffer)
+                    if pmid_match:
+                        pmid = pmid_match.group(1)
+                        # Add gene name before PMID
+                        modified_buffer = re.sub(r'(#PMID-\s*\d+)', f'#GENENAME- {gene_name} \\1', buffer)
+                        handle.write('#'+ gene_name + '\t' + modified_buffer + '\n')
+                    else:
+                        # If PMID is not found, just prepend the gene name
+                        handle.write('#'+ gene_name + '\t#GENENAME- ' + gene_name + ' ' + buffer + '\n')
+                counting += 1
+        handle.close()
+
+def main():
+    base = sys.argv[1]
+    output = sys.argv[2]
+
+    # Update paths to match your new structure
+    batch_dir = base
+    gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
+    baseline_doc_dir = os.path.join(batch_dir, 'results', 'baseline_doc')
+    comb_dir = os.path.join(output, 'arranged')
+    preprocessed_dir = os.path.join(output, 'gene_based_preprocessed')
+
+    print(f"Checking directories...")
+    print(f"batch_dir: {batch_dir}")
+    print(f"gene_based_dir: {gene_based_dir}")
+    print(f"baseline_doc_dir: {baseline_doc_dir}")
+    print(f"comb_dir: {comb_dir}")
+    print(f"preprocessed_dir: {preprocessed_dir}")
+
+    if not all(map(check_directory, [batch_dir, gene_based_dir, baseline_doc_dir])):
+        sys.exit("One or more required directories do not exist. Please check the paths and try again.")
+
+    pathlib.Path(comb_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)
+
+    lp = CustomPreprocessing(base, batch_dir, comb_dir, preprocessed_dir) 
+
+    # Process gene-based documents
+    consolidated_file = os.path.join(comb_dir, 'consolidated_gene_docs.txt')
+
+    if not os.path.exists(consolidated_file):
+        print("Consolidated file not found. Starting consolidation process...")
+        gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
+        print(f"Found {len(gene_files)} gene-based documents.")
+
+        if not gene_files:
+            print("No gene-based documents found. Skipping this step.")
+        else:
+            with open(consolidated_file, 'w', encoding='utf-8') as outfile:
+                for i, file in enumerate(gene_files, 1):
+                    try:
+                        gene_name = os.path.splitext(file)[0]  # Get filename without extension
+                        content = safe_read(os.path.join(gene_based_dir, file))
+                        if content is not None:
+                            content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
+                            outfile.write(f"#GENENAME- {gene_name}\n{content}\n\n")
+                        else:
+                            print(f"Skipping file {file} due to reading error.")
+                    except Exception as e:
+                        print(f"Error processing file {file}: {e}")
+                        print(traceback.format_exc())
+                    if i % 1000 == 0:
+                        print(f"Consolidating file {i}/{len(gene_files)}")
+            print("All gene-based documents consolidated.")
+    else:
+        print("Consolidated file found. Skipping consolidation process.")
+
+    print("Processing consolidated gene-based document...")
+    processed_genes_file = os.path.join(preprocessed_dir, 'processed_genes.json')
+    processed_genes = set()
+
+    if os.path.exists(processed_genes_file):
+        with open(processed_genes_file, 'r') as f:
+            processed_genes = set(json.load(f))
+        print(f"Resuming from {len(processed_genes)} previously processed genes.")
+
+    output_file = os.path.join(preprocessed_dir, 'consolidated_gene_docs.data.doc.txt')
+    mode = 'a' if os.path.exists(output_file) else 'w'
+
+    for gene2doc_chunk in process_gene_docs_in_chunks(consolidated_file):
+        print(f"Processing chunk with {len(gene2doc_chunk)} genes...")
+        new_genes = set(gene2doc_chunk.keys()) - processed_genes
+        if new_genes:
+            doc_gene = list(new_genes)
+            lp.making_doc_data(doc_gene, 'consolidated_gene_docs', {g: gene2doc_chunk[g] for g in new_genes}, mode)
+            processed_genes.update(new_genes)
+            
+            # Update the processed genes file
+            with open(processed_genes_file, 'w') as f:
+                json.dump(list(processed_genes), f)
+        else:
+            print("All genes in this chunk have been processed already. Moving to next chunk.")
+        
+        # Change mode to 'a' after first write
+        mode = 'a'
+
+    print("All processing completed.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/step2_data_preprocessing_Luis_Custom.py b/code/step2_data_preprocessing_Luis_Custom_document_based.py
similarity index 100%
rename from code/step2_data_preprocessing_Luis_Custom.py
rename to code/step2_data_preprocessing_Luis_Custom_document_based.py
diff --git a/code/step2_data_preprocessing_Luis_final.py b/code/step2_data_preprocessing_Luis_final_nonfinal.py
similarity index 100%
rename from code/step2_data_preprocessing_Luis_final.py
rename to code/step2_data_preprocessing_Luis_final_nonfinal.py
diff --git a/code/step2_data_preprocessing_Luis_genebased_new.py b/code/step2_data_preprocessing_Luis_genebased_new.py
new file mode 100644
index 0000000..27d97f7
--- /dev/null
+++ b/code/step2_data_preprocessing_Luis_genebased_new.py
@@ -0,0 +1,121 @@
+import os
+import pathlib
+import sys
+import unicodedata
+import json
+import re
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import multiprocessing
+sys.path.append('lib')  
+import lib.Literature_Data_Preprocessing as ldp
+
+def check_directory(path):
+    if not os.path.exists(path):
+        print(f"Directory does not exist: {path}")
+        return False
+    return True
+
+def safe_read(file_path):
+    encodings = ['utf-8', 'latin-1']
+    for encoding in encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                return f.read()
+        except UnicodeDecodeError:
+            continue
+        except Exception as e:
+            print(f"Unexpected error reading file {file_path}: {e}")
+    print(f"Error reading file {file_path}: Unable to decode with available encodings")
+    return None
+
+def load_gene_info(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return {line.strip(): line.strip() for line in f}
+
+class CustomPreprocessing(ldp.preprocessing):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pmid_pattern = re.compile(r'#PMID-\s*(\d+)')
+        
+    def process_gene_file(self, file_path, gene_symbol, gene_name):
+        content = safe_read(file_path)
+        if content is not None:
+            content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('ascii')
+            buffer = self.doc_preprocessor(content)
+            if buffer:
+                pmid_match = self.pmid_pattern.search(buffer)
+                if pmid_match:
+                    modified_buffer = re.sub(r'(#PMID-\s*\d+)', f'#GENESYMBOL- {gene_symbol} #GENENAME- {gene_name} \\1', buffer)
+                    return f'#{gene_symbol}\t{modified_buffer}\n'
+                else:
+                    return f'#{gene_symbol}\t#GENESYMBOL- {gene_symbol} #GENENAME- {gene_name} {buffer}\n'
+        return None
+
+def process_gene(args):
+    lp, file_path, gene_symbol, gene_name = args
+    return lp.process_gene_file(file_path, gene_symbol, gene_name)
+
+def main(base, output):
+    batch_dir = base
+    gene_based_dir = os.path.join(batch_dir, 'results', 'gene_based_records')
+    gene_info_dir = os.path.join(batch_dir, 'data', 'gene_name_info')
+    preprocessed_dir = os.path.join(output, 'gene_based_preprocessed')
+
+    print("Checking directories...")
+    for dir_path in [batch_dir, gene_based_dir, gene_info_dir]:
+        if not check_directory(dir_path):
+            sys.exit(f"Directory does not exist: {dir_path}")
+
+    pathlib.Path(preprocessed_dir).mkdir(parents=True, exist_ok=True)
+
+    lp = CustomPreprocessing(base, batch_dir, None, preprocessed_dir)
+
+    gene_symbols = load_gene_info(os.path.join(gene_info_dir, 'query_symbol.txt'))
+    gene_names = load_gene_info(os.path.join(gene_info_dir, 'query_full_name.txt'))
+
+    output_file = os.path.join(preprocessed_dir, 'consolidated_gene_docs.data.doc.txt')
+    processed_genes_file = os.path.join(preprocessed_dir, 'processed_genes.json')
+
+    processed_genes = set()
+    if os.path.exists(processed_genes_file):
+        with open(processed_genes_file, 'r') as f:
+            processed_genes = set(json.load(f))
+        print(f"Resuming from {len(processed_genes)} previously processed genes.")
+
+    gene_files = [f for f in os.listdir(gene_based_dir) if f.endswith('.txt')]
+    total_files = len(gene_files)
+    print(f"Found {total_files} gene-based documents.")
+
+    num_processes = multiprocessing.cpu_count()
+    
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
+        futures = []
+        for file in gene_files:
+            gene_symbol = os.path.splitext(file)[0]
+            if gene_symbol not in processed_genes:
+                gene_name = gene_names.get(gene_symbol, gene_symbol)
+                file_path = os.path.join(gene_based_dir, file)
+                futures.append(executor.submit(process_gene, (lp, file_path, gene_symbol, gene_name)))
+
+        with open(output_file, 'a', encoding='utf-8') as outfile:
+            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing genes"):
+                result = future.result()
+                if result:
+                    outfile.write(result)
+                    gene_symbol = result.split('\t')[0][1:]  # Extract gene symbol from the result
+                    processed_genes.add(gene_symbol)
+
+                # Save progress every 1000 genes
+                if len(processed_genes) % 1000 == 0:
+                    with open(processed_genes_file, 'w') as pf:
+                        json.dump(list(processed_genes), pf)
+
+    # Final save of processed genes
+    with open(processed_genes_file, 'w') as pf:
+        json.dump(list(processed_genes), pf)
+
+    print("All processing completed.")
+
+if __name__ == "__main__":
+    main(sys.argv[1], sys.argv[2])
\ No newline at end of file
diff --git a/code/step2_data_preprocessing_Luis_new.py b/code/step2_data_preprocessing_Luis_new_nonfinal.py
similarity index 76%
rename from code/step2_data_preprocessing_Luis_new.py
rename to code/step2_data_preprocessing_Luis_new_nonfinal.py
index 2250b4d..6af53c5 100644
--- a/code/step2_data_preprocessing_Luis_new.py
+++ b/code/step2_data_preprocessing_Luis_new_nonfinal.py
@@ -4,6 +4,8 @@
 import unicodedata
 import traceback
 import json
+import re
+
 sys.path.append('lib')  
 import lib.Literature_Data_Preprocessing as ldp
 
@@ -55,7 +57,32 @@ def process_gene_docs_in_chunks(file_path, chunk_size=10000):
         if gene2doc:
             yield gene2doc
 
+def load_gene_info(gene_info_dir):
+    gene_names_file = os.path.join(gene_info_dir, 'query_full_name.txt')
+    gene_symbols_file = os.path.join(gene_info_dir, 'query_symbol.txt')
+    
+    gene_names = {}
+    gene_symbols = {}
+    
+    with open(gene_names_file, 'r') as f:
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 2:
+                gene_names[parts[0]] = parts[1]
+    
+    with open(gene_symbols_file, 'r') as f:
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 2:
+                gene_symbols[parts[0]] = parts[1]
+    
+    return gene_names, gene_symbols
+
 class CustomPreprocessing(ldp.preprocessing):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.gene_names, self.gene_symbols = load_gene_info(os.path.join(args[0], 'data', 'gene_name_info'))
+
     def making_doc_data(self, gene_list, name, dic, mode='w'):
         preprocessed_dir = self.preprocessed_dir
         counting = 0
@@ -78,13 +105,30 @@ def making_doc_data(self, gene_list, name, dic, mode='w'):
                 if counting == 10000:
                     print(i, '/', len(gene_list))
                     counting = 0
-                data = dic[gene_list[i]] 
+                gene_name = gene_list[i]
+                data = dic[gene_name] 
                 buffer = self.doc_preprocessor(data)
                 if buffer != '':
-                    handle.write('#'+ gene_list[i] + '\t' + buffer + '\n')
+                    gene_symbol = self.gene_symbols.get(gene_name, "Unknown")
+                    gene_full_name = self.gene_names.get(gene_name, "Unknown")
+                    
+                    # Add gene name, symbol, and ID at the beginning
+                    gene_info = f'#GENENAME- {gene_name} #GENESYMBOL- {gene_symbol} #GENEID- {gene_full_name} '
+                    
+                    # Extract PMID from the buffer and move it after gene info
+                    pmid_match = re.search(r'#PMID-\s*(\d+)', buffer)
+                    if pmid_match:
+                        pmid = pmid_match.group(0)
+                        buffer = re.sub(r'#PMID-\s*\d+', '', buffer)  # Remove PMID from its original position
+                        modified_buffer = gene_info + pmid + ' ' + buffer.strip()
+                    else:
+                        modified_buffer = gene_info + buffer
+                    
+                    handle.write('#' + gene_name + '\t' + modified_buffer + '\n')
                 counting += 1
         handle.close()
 
+# The rest of the script remains the same
 base = sys.argv[1]
 output = sys.argv[2]
 
diff --git a/code/step3_literature_embedding_LDA_Luis.py b/code/step3_literature_embedding_LDA_Luis.py
new file mode 100644
index 0000000..ba15f60
--- /dev/null
+++ b/code/step3_literature_embedding_LDA_Luis.py
@@ -0,0 +1,150 @@
+import os
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from gensim import corpora
+from gensim.models import LdaMulticore, CoherenceModel
+from gensim.parsing.preprocessing import STOPWORDS
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from wordcloud import WordCloud
+import pyLDAvis
+import pyLDAvis.gensim_models as gensimvis
+
+# Download necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
+
+# Function to preprocess text
+def preprocess(text):
+    tokens = word_tokenize(text.lower())
+    stop_words = set(stopwords.words('english')).union(STOPWORDS)
+    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
+    return tokens
+
+# Function to read and preprocess documents
+def read_and_preprocess_docs(directory, gene_file=None):
+    documents = []
+    doc_names = []
+    
+    for filename in os.listdir(directory):
+        if filename.endswith('.txt'):
+            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
+                text = file.read()
+                documents.append(preprocess(text))
+                doc_names.append(filename)
+    
+    if gene_file:
+        with open(gene_file, 'r', encoding='utf-8') as file:
+            gene_docs = file.read().split('#GENENAME-')
+            for gene_doc in gene_docs[1:]:
+                gene_name = gene_doc.split('\n', 1)[0].strip()
+                gene_text = gene_doc.split('\n', 1)[1]
+                documents.append(preprocess(gene_text))
+                doc_names.append(f"Gene_{gene_name}")
+    
+    return documents, doc_names
+
+# Set the directory paths
+baseline_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed"
+gene_file = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt"
+
+# Read and preprocess documents
+docs, doc_names = read_and_preprocess_docs(baseline_dir, gene_file)
+
+# Create dictionary and corpus
+dictionary = corpora.Dictionary(docs)
+corpus = [dictionary.doc2bow(doc) for doc in docs]
+
+# Function to compute coherence values
+def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
+    coherence_values = []
+    model_list = []
+    for num_topics in range(start, limit, step):
+        model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100)
+        model_list.append(model)
+        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
+        coherence_values.append(coherencemodel.get_coherence())
+    return model_list, coherence_values
+
+# Compute coherence values
+model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=5, limit=50, step=5)
+
+# Find the optimal number of topics
+optimal_num_topics = coherence_values.index(max(coherence_values)) * 5 + 5
+
+# Train LDA model with optimal number of topics
+lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=optimal_num_topics, 
+                         random_state=100, chunksize=100, passes=10, per_word_topics=True)
+
+# Function to format topic distribution
+def format_topics_sentences(ldamodel, corpus, doc_names):
+    sent_topics_df = pd.DataFrame()
+    for i, row in enumerate(ldamodel[corpus]):
+        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
+        for j, (topic_num, prop_topic) in enumerate(row):
+            if j == 0:
+                wp = ldamodel.show_topic(topic_num)
+                topic_keywords = ", ".join([word for word, prop in wp])
+                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), 
+                                                                  topic_keywords, doc_names[i]]), 
+                                                       ignore_index=True)
+            else:
+                break
+    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Document']
+    return sent_topics_df
+
+# Format results
+df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, doc_names=doc_names)
+
+# Save results to CSV
+df_topic_sents_keywords.to_csv('lda_results_combined.csv', index=False)
+
+# Analyze ZIP11 and Zinc Homeostasis related topics
+zip11_zinc_topics = df_topic_sents_keywords[
+    df_topic_sents_keywords['Topic_Keywords'].str.contains('zip11|zinc|homeostasis', case=False)
+]
+print("\nTopics related to ZIP11 or Zinc Homeostasis:")
+print(zip11_zinc_topics[['Dominant_Topic', 'Topic_Keywords']])
+
+# Visualization 1: Topic Distribution
+plt.figure(figsize=(12, 6))
+topic_dist = df_topic_sents_keywords['Dominant_Topic'].value_counts().sort_index()
+sns.barplot(x=topic_dist.index, y=topic_dist.values)
+plt.title('Distribution of Dominant Topics')
+plt.xlabel('Topic Number')
+plt.ylabel('Number of Documents')
+plt.savefig('topic_distribution.png')
+plt.close()
+
+# Visualization 2: Word Cloud of ZIP11 and Zinc Homeostasis related topics
+zip11_zinc_text = ' '.join(zip11_zinc_topics['Topic_Keywords'])
+wordcloud = WordCloud(width=800, height=400, background_color='white').generate(zip11_zinc_text)
+plt.figure(figsize=(10, 5))
+plt.imshow(wordcloud, interpolation='bilinear')
+plt.axis('off')
+plt.title('Word Cloud: ZIP11 and Zinc Homeostasis Topics')
+plt.savefig('zip11_zinc_wordcloud.png')
+plt.close()
+
+# Visualization 3: pyLDAvis
+vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
+pyLDAvis.save_html(vis_data, 'lda_visualization.html')
+
+print("Analysis complete. Results saved to CSV and visualizations saved as PNG files.")
+print("Interactive LDA visualization saved as 'lda_visualization.html'")
+
+# Additional analysis: Top words for ZIP11 and Zinc Homeostasis topics
+print("\nTop words for ZIP11 and Zinc Homeostasis related topics:")
+for topic in zip11_zinc_topics['Dominant_Topic'].unique():
+    print(f"Topic {topic}:")
+    print(lda_model.show_topic(topic))
+    print()
+
+# Document-Topic Distribution for ZIP11 and Zinc Homeostasis documents
+zip11_zinc_docs = df_topic_sents_keywords[
+    df_topic_sents_keywords['Document'].str.contains('zip11|zinc|homeostasis', case=False)
+]
+print("\nDocument-Topic Distribution for ZIP11 and Zinc Homeostasis documents:")
+print(zip11_zinc_docs[['Document', 'Dominant_Topic', 'Perc_Contribution']])
\ No newline at end of file
diff --git a/code/step3_literature_embedding_training_Luis.py b/code/step3_literature_embedding_training_Luis.py
index c8fcca3..972fb7e 100644
--- a/code/step3_literature_embedding_training_Luis.py
+++ b/code/step3_literature_embedding_training_Luis.py
@@ -149,7 +149,7 @@ def process_in_batches(file_path, model, tokenizer, embedding_saver, is_gene_bas
     print(f"Processed {chunk_count} chunks from {file_path}")
 
 # Set up paths
-gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\arranged\consolidated_gene_docs.txt"
+gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt"
 baseline_doc_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed"
 output_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\embeddings"
 
diff --git a/code/step3_literature_embedding_training_Luis_new.py b/code/step3_literature_embedding_training_Luis_new.py
new file mode 100644
index 0000000..11a17f3
--- /dev/null
+++ b/code/step3_literature_embedding_training_Luis_new.py
@@ -0,0 +1,332 @@
+import os
+import torch
+from torch.utils.data import IterableDataset
+from tqdm import tqdm
+import gc
+import json
+import numpy as np
+from collections import Counter
+from torch import nn
+import torch.optim as optim
+import random
+import itertools
+import pickle
+
+# Try to import various transformer models and tokenizers
+try:
+    from transformers import (
+        AlbertTokenizer, AlbertModel,
+        BertTokenizer, BertModel,
+        RobertaTokenizer, RobertaModel,
+        DistilBertTokenizer, DistilBertModel,
+        XLNetTokenizer, XLNetModel
+    )
+    # Define a dictionary of available models with their respective tokenizers and model classes
+    MODELS = {
+        'albert': (AlbertTokenizer, AlbertModel, 'albert-base-v2'),
+        'bert': (BertTokenizer, BertModel, 'bert-base-uncased'),
+        'roberta': (RobertaTokenizer, RobertaModel, 'roberta-base'),
+        'distilbert': (DistilBertTokenizer, DistilBertModel, 'distilbert-base-uncased'),
+        'xlnet': (XLNetTokenizer, XLNetModel, 'xlnet-base-cased')
+    }
+    print("All models available")
+except ImportError as e:
+    # If some models are not available, print an error message
+    print(f"Some models might not be available: {e}")
+    MODELS = {}
+
+class Vocabulary:
+    def __init__(self, min_count=5, min_size=2):
+        self.word2idx = {}
+        self.idx2word = []
+        self.word_counts = Counter()
+        self.min_count = min_count
+        self.min_size = min_size
+
+    def add_document(self, doc):
+        for word in doc.split():
+            if len(word) >= self.min_size:
+                self.word_counts[word] += 1
+
+    def build_vocab(self):
+        for word, count in self.word_counts.items():
+            if count >= self.min_count:
+                self.idx2word.append(word)
+                self.word2idx[word] = len(self.idx2word) - 1
+
+    def save_vocab(self, path):
+        with open(os.path.join(path, 'vocabulary.json'), 'w') as f:
+            json.dump(self.word2idx, f)
+        with open(os.path.join(path, 'reverse_vocabulary.json'), 'w') as f:
+            json.dump(self.idx2word, f)
+
+class Gene2DocDataset(IterableDataset):
+    def __init__(self, gene2doc_file, vocabulary, window_size=2, chunk_size=1000):
+        self.gene2doc_file = gene2doc_file
+        self.vocabulary = vocabulary
+        self.window_size = window_size
+        self.chunk_size = chunk_size
+
+    def __iter__(self):
+        with open(self.gene2doc_file, 'r', encoding='utf-8') as f:
+            chunk = []
+            for line in f:
+                parts = line.strip().split('\t')
+                if len(parts) == 2:
+                    gene, doc = parts
+                    words = [self.vocabulary.word2idx[w] for w in doc.split() if w in self.vocabulary.word2idx]
+                    chunk.append(words)
+                    
+                    if len(chunk) >= self.chunk_size:
+                        yield from self.process_chunk(chunk)
+                        chunk = []
+            
+            if chunk:  # Process any remaining data
+                yield from self.process_chunk(chunk)
+
+    def process_chunk(self, chunk):
+        for words in chunk:
+            for i in range(len(words)):
+                for j in range(max(0, i - self.window_size), min(len(words), i + self.window_size + 1)):
+                    if i != j:
+                        yield torch.tensor(words[i]), torch.tensor(words[j])
+
+class Word2DocDataset(IterableDataset):
+    def __init__(self, baseline_doc_dir, vocabulary, window_size=2, chunk_size=1000):
+        self.baseline_doc_dir = baseline_doc_dir
+        self.vocabulary = vocabulary
+        self.window_size = window_size
+        self.chunk_size = chunk_size
+
+    def __iter__(self):
+        for filename in os.listdir(self.baseline_doc_dir):
+            if filename.endswith('.data.doc.txt'):
+                file_path = os.path.join(self.baseline_doc_dir, filename)
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    chunk = []
+                    for line in f:
+                        words = [self.vocabulary.word2idx[w] for w in line.strip().split() if w in self.vocabulary.word2idx]
+                        chunk.append(words)
+                        
+                        if len(chunk) >= self.chunk_size:
+                            yield from self.process_chunk(chunk)
+                            chunk = []
+                    
+                    if chunk:  # Process any remaining data
+                        yield from self.process_chunk(chunk)
+
+    def process_chunk(self, chunk):
+        for words in chunk:
+            for i in range(len(words)):
+                for j in range(max(0, i - self.window_size), min(len(words), i + self.window_size + 1)):
+                    if i != j:
+                        yield torch.tensor(words[i]), torch.tensor(words[j])
+
+class SkipGramModel(nn.Module):
+    def __init__(self, vocab_size, embedding_dim):
+        super(SkipGramModel, self).__init__()
+        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
+        self.linear = nn.Linear(embedding_dim, vocab_size)
+
+    def forward(self, inputs):
+        embeds = self.embeddings(inputs)
+        out = self.linear(embeds)
+        return out
+
+class EmbeddingSaver:
+    def __init__(self, base_path):
+        self.base_path = base_path
+        os.makedirs(base_path, exist_ok=True)
+        self.metadata = {}
+
+    def save_embeddings(self, embeddings, vocabulary):
+        embedding_path = os.path.join(self.base_path, 'embeddings.npy')
+        np.save(embedding_path, embeddings)
+        
+        vocabulary.save_vocab(self.base_path)
+        
+        self.metadata['embeddings'] = embedding_path
+        self.metadata['vocabulary'] = os.path.join(self.base_path, 'vocabulary.json')
+        self.metadata['reverse_vocabulary'] = os.path.join(self.base_path, 'reverse_vocabulary.json')
+
+    def save_metadata(self):
+        with open(os.path.join(self.base_path, 'embedding_metadata.json'), 'w') as f:
+            json.dump(self.metadata, f, indent=2)
+
+def save_vocabulary_checkpoint(vocab, output_dir):
+    checkpoint_path = os.path.join(output_dir, 'vocabulary_checkpoint.pkl')
+    with open(checkpoint_path, 'wb') as f:
+        pickle.dump(vocab, f)
+    print(f"Vocabulary checkpoint saved to {checkpoint_path}")
+
+def load_vocabulary_checkpoint(output_dir):
+    checkpoint_path = os.path.join(output_dir, 'vocabulary_checkpoint.pkl')
+    if os.path.exists(checkpoint_path):
+        with open(checkpoint_path, 'rb') as f:
+            vocab = pickle.load(f)
+        print(f"Vocabulary checkpoint loaded from {checkpoint_path}")
+        return vocab
+    return None
+
+def save_dataset_checkpoint(dataset, output_dir, dataset_name):
+    checkpoint_path = os.path.join(output_dir, f'{dataset_name}_checkpoint.pkl')
+    with open(checkpoint_path, 'wb') as f:
+        pickle.dump(dataset, f)
+    print(f"{dataset_name} checkpoint saved to {checkpoint_path}")
+
+def load_dataset_checkpoint(output_dir, dataset_name):
+    checkpoint_path = os.path.join(output_dir, f'{dataset_name}_checkpoint.pkl')
+    if os.path.exists(checkpoint_path):
+        with open(checkpoint_path, 'rb') as f:
+            dataset = pickle.load(f)
+        print(f"{dataset_name} checkpoint loaded from {checkpoint_path}")
+        return dataset
+    return None
+
+def save_checkpoint(model, optimizer, epoch, loss, path):
+    torch.save({
+        'epoch': epoch,
+        'model_state_dict': model.state_dict(),
+        'optimizer_state_dict': optimizer.state_dict(),
+        'loss': loss,
+    }, path)
+
+def load_checkpoint(model, optimizer, path):
+    if os.path.exists(path):
+        checkpoint = torch.load(path)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        epoch = checkpoint['epoch']
+        loss = checkpoint['loss']
+        print(f"Checkpoint loaded from {path}")
+        return model, optimizer, epoch, loss
+    return model, optimizer, 0, float('inf')
+
+def train_model(model, gene2doc_dataset, word2doc_dataset, device, output_dir, num_epochs=10, batch_size=128, save_interval=1):
+    optimizer = optim.Adam(model.parameters())
+    criterion = nn.CrossEntropyLoss()
+
+    # Load checkpoint if exists
+    checkpoint_path = os.path.join(output_dir, 'latest_checkpoint.pt')
+    model, optimizer, start_epoch, best_loss = load_checkpoint(model, optimizer, checkpoint_path)
+
+    for epoch in range(start_epoch, num_epochs):
+        total_loss = 0
+        batch_count = 0
+        
+        combined_data = itertools.chain(gene2doc_dataset, word2doc_dataset)
+        
+        batch = []
+        for item in combined_data:
+            batch.append(item)
+            if len(batch) == batch_size:
+                inputs, targets = zip(*batch)
+                inputs = torch.stack(inputs).to(device)
+                targets = torch.stack(targets).to(device)
+                
+                optimizer.zero_grad()
+                outputs = model(inputs)
+                loss = criterion(outputs, targets)
+                loss.backward()
+                optimizer.step()
+                
+                total_loss += loss.item()
+                batch_count += 1
+                
+                batch = []
+                
+                if batch_count % 1000 == 0:
+                    print(f"Epoch {epoch+1}, Batch {batch_count}, Loss: {total_loss/batch_count:.4f}")
+        
+        if batch:
+            inputs, targets = zip(*batch)
+            inputs = torch.stack(inputs).to(device)
+            targets = torch.stack(targets).to(device)
+            
+            optimizer.zero_grad()
+            outputs = model(inputs)
+            loss = criterion(outputs, targets)
+            loss.backward()
+            optimizer.step()
+            
+            total_loss += loss.item()
+            batch_count += 1
+
+        avg_loss = total_loss / batch_count
+        print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")
+
+        # Save checkpoint
+        save_checkpoint(model, optimizer, epoch + 1, avg_loss, checkpoint_path)
+        print(f"Checkpoint saved to {checkpoint_path}")
+
+        if avg_loss < best_loss:
+            best_loss = avg_loss
+            best_model_path = os.path.join(output_dir, 'best_model.pt')
+            torch.save(model.state_dict(), best_model_path)
+            print(f"Best model saved to {best_model_path}")
+
+    return model
+
+# Main execution
+if __name__ == "__main__":
+    # Set up paths
+    gene_based_path = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\gene_based_preprocessed\consolidated_gene_docs.data.doc.txt"
+    baseline_doc_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\preprocessed"
+    output_dir = r"C:\Users\lrm22005\OneDrive - University of Connecticut\Research\ZIP11_Bioinformatic\capsule-3642152\ZIP11\output\embeddings"
+
+    # Create vocabulary
+    vocab = load_vocabulary_checkpoint(output_dir)
+    if vocab is None:
+        print("Creating vocabulary...")
+        vocab = Vocabulary(min_count=5, min_size=2)
+        with open(gene_based_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                vocab.add_document(line.split('\t')[1])
+        
+        for filename in os.listdir(baseline_doc_dir):
+            if filename.endswith('.data.doc.txt'):
+                file_path = os.path.join(baseline_doc_dir, filename)
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        vocab.add_document(line)
+        
+        vocab.build_vocab()
+        save_vocabulary_checkpoint(vocab, output_dir)
+
+    # Save vocabulary
+    vocab.save_vocab(output_dir)
+    print(f"Vocabulary saved to {output_dir}")
+
+    # Create datasets
+    print("Creating datasets...")
+    gene2doc_dataset = load_dataset_checkpoint(output_dir, 'gene2doc')
+    if gene2doc_dataset is None:
+        gene2doc_dataset = Gene2DocDataset(gene_based_path, vocab, chunk_size=500)
+        save_dataset_checkpoint(gene2doc_dataset, output_dir, 'gene2doc')
+
+    word2doc_dataset = load_dataset_checkpoint(output_dir, 'word2doc')
+    if word2doc_dataset is None:
+        word2doc_dataset = Word2DocDataset(baseline_doc_dir, vocab, chunk_size=500)
+        save_dataset_checkpoint(word2doc_dataset, output_dir, 'word2doc')
+
+    # Initialize model
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = SkipGramModel(len(vocab.idx2word), embedding_dim=128).to(device)
+
+    # Train model
+    print("Training model...")
+    trained_model = train_model(model, gene2doc_dataset, word2doc_dataset, device, output_dir, save_interval=1)
+
+    # Save final model and embeddings
+    print("Saving final model and embeddings...")
+    final_model_path = os.path.join(output_dir, 'final_model.pt')
+    torch.save(trained_model.state_dict(), final_model_path)
+    print(f"Final model saved to {final_model_path}")
+
+    embedding_saver = EmbeddingSaver(output_dir)
+    embeddings = trained_model.embeddings.weight.cpu().detach().numpy()
+    embedding_saver.save_embeddings(embeddings, vocab)
+    embedding_saver.save_metadata()
+    print(f"Final embeddings and metadata saved to {output_dir}")
+
+    print("Embedding process completed.")
\ No newline at end of file
diff --git a/code/step4_CVD_risk_factor_identification_Luis.py b/code/step4_CVD_risk_factor_identification_Luis.py
new file mode 100644
index 0000000..448dbc6
--- /dev/null
+++ b/code/step4_CVD_risk_factor_identification_Luis.py
@@ -0,0 +1,104 @@
+import os
+import torch
+import numpy as np
+import json
+from sklearn.metrics.pairwise import cosine_similarity
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+
+class EmbeddingModel:
+    def __init__(self, embedding_path, vocab_path, gene_symbol_path=None):
+        self.embeddings = np.load(embedding_path)
+        with open(vocab_path, 'r') as f:
+            self.word2idx = json.load(f)
+        self.idx2word = {v: k for k, v in self.word2idx.items()}
+        
+        self.gene_symbols = set()
+        if gene_symbol_path:
+            with open(gene_symbol_path, 'r') as f:
+                self.gene_symbols = set(line.strip() for line in f)
+
+    def get_word_vector(self, word):
+        if word in self.word2idx:
+            return self.embeddings[self.word2idx[word]]
+        else:
+            raise ValueError(f"Word '{word}' not in vocabulary")
+
+    def find_similar_words(self, query, n=25, include_similarity=True):
+        if isinstance(query, str):
+            query_vector = self.get_word_vector(query)
+        else:
+            query_vector = query
+        
+        similarities = cosine_similarity([query_vector], self.embeddings)[0]
+        top_n = similarities.argsort()[-n:][::-1]
+        
+        if include_similarity:
+            return [(self.idx2word[i], similarities[i]) for i in top_n]
+        else:
+            return [self.idx2word[i] for i in top_n]
+
+    def similarity_display(self, query, output_path, top_n=25):
+        similar_words = self.find_similar_words(query, n=top_n)
+        
+        os.makedirs(output_path, exist_ok=True)
+        output_file = os.path.join(output_path, f"{query.replace(' ', '_')}_similar_words.txt")
+        
+        with open(output_file, 'w') as f:
+            f.write(f"Query: {query}\n")
+            f.write("Similar words:\n")
+            for word, similarity in similar_words:
+                f.write(f"{word}: {similarity:.4f}\n")
+        
+        print(f"Results saved to {output_file}")
+
+    def visualize_words(self, words, filename=None):
+        word_vectors = [self.get_word_vector(word) for word in words]
+        tsne = TSNE(n_components=2, random_state=42)
+        reduced_vectors = tsne.fit_transform(word_vectors)
+
+        plt.figure(figsize=(12, 8))
+        for i, word in enumerate(words):
+            x, y = reduced_vectors[i]
+            plt.scatter(x, y)
+            plt.annotate(word, (x, y))
+        
+        plt.title("Word Embeddings Visualization")
+        if filename:
+            plt.savefig(filename)
+        plt.show()
+
+class RunIntrinsicEvaluation:
+    def __init__(self):
+        self.model = None
+
+    def setting(self, path, gene_symb):
+        embedding_path = os.path.join(path, 'embeddings.npy')
+        vocab_path = os.path.join(path, 'vocabulary.json')
+        self.model = EmbeddingModel(embedding_path, vocab_path, gene_symb)
+
+    def running(self, query, output_path, top_n):
+        if self.model is None:
+            raise ValueError("Model not set. Call setting() first.")
+        self.model.similarity_display(query, output_path, top_n)
+
+# Usage example
+if __name__ == "__main__":
+    import sys
+    
+    model_path = str(sys.argv[1])
+    output_path = str(sys.argv[2])
+    gene_symb_path = '../data/gene_name_info/query_symbol'
+
+    queries = ['Zn Transport', 'protein names', 'drug interactions', 'cancer', 'drug names', 'protein drug', 'zinc', 'zn pathway']
+    TOPNUM = 25
+
+    evaluator = RunIntrinsicEvaluation()
+    evaluator.setting(path=model_path, gene_symb=gene_symb_path)
+
+    for query in queries:
+        evaluator.running(query, output_path, TOPNUM)
+
+    # Visualization example
+    words_to_visualize = ["gene", "protein", "dna", "rna", "cell", "mutation"]
+    evaluator.model.visualize_words(words_to_visualize, os.path.join(output_path, "word_embeddings_visualization.png"))
\ No newline at end of file
diff --git a/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt b/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt
deleted file mode 100644
index 9b9fb7e..0000000
Binary files a/output/proprocessed_example/baseline_embeddings_chelators_and_ZIP11.data.doc_part0.pt and /dev/null differ
diff --git a/output/proprocessed_example/gene_based_embeddings_part0.pt b/output/proprocessed_example/gene_based_embeddings_part0.pt
deleted file mode 100644
index 49c8fac..0000000
Binary files a/output/proprocessed_example/gene_based_embeddings_part0.pt and /dev/null differ