# updated_matrix_generator.py

import glob
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#read file

# Word list used to filter tweets: the "Word" column of the concreteness
# ratings CSV, which is looked up relative to the current working directory.
word_df = pd.read_csv(
    r"Concreteness_ratings_Brysbaert_et_al_BRM.csv",
    encoding="utf-8",
)
common_words = word_df.loc[:, "Word"].tolist()

#Change path to the location on your own computer
path = "/Users/qixia/Git/Charles_ContextEffectsOnAbstract_tweets/compilations/*.csv"


def tweets_with_location(df):
    """Return the located rows of *df* as a tweet/user/location table.

    Rows whose ``location`` is missing are dropped. The ``text``,
    ``screen_name`` and ``location`` values of the remaining rows are
    converted with ``str()`` and returned as the columns ``tweet``,
    ``user`` and ``location`` of a new DataFrame with a fresh 0..n-1 index.
    """
    located = df.dropna(subset=['location']).reset_index(drop=True)
    return pd.DataFrame({
        'tweet': [str(text) for text in located['text']],
        'user': [str(name) for name in located['screen_name']],
        'location': [str(place) for place in located['location']],
    })


def keep_known_words(text, vocabulary):
    """Return *text* reduced to the words that occur in *vocabulary*.

    *text* is split on whitespace; surviving words keep their order and are
    joined by single spaces, so the result never has leading, trailing or
    repeated whitespace. Pass a ``set``/``frozenset`` as *vocabulary* to get
    constant-time membership tests.
    """
    return " ".join(word for word in text.split() if word in vocabulary)


def similarity_csv_name(fname):
    """Return the output file name for the input CSV path *fname*.

    The directory and the extension are stripped and ``_tfidf_matrix.csv``
    is appended, so the result is a bare file name (written relative to the
    current working directory by the caller).
    """
    stem = os.path.splitext(os.path.basename(fname))[0]
    return stem + "_tfidf_matrix.csv"


def pairwise_similarity_table(documents, labels):
    """Return the cosine-similarity table of *documents*, labelled by *labels*.

    Each document is turned into a vector of raw term counts with
    ``CountVectorizer`` and every pair of vectors is compared with cosine
    similarity. The result is a square DataFrame whose rows and columns are
    both labelled with *labels* (one label per document, in order).
    """
    # The sparse count matrix is passed straight to cosine_similarity: this
    # avoids allocating a dense documents-by-terms table and does not depend
    # on CountVectorizer.get_feature_names(), which newer scikit-learn
    # releases no longer provide.
    term_counts = CountVectorizer().fit_transform(documents)
    similarity = cosine_similarity(term_counts)
    return pd.DataFrame(similarity, index=labels, columns=labels)


# sorted() only makes the processing order (and the progress output)
# deterministic; every matching file is still handled independently.
for fname in sorted(glob.glob(path)):
    df = pd.read_csv(fname, encoding="utf-8")
    tweets = tweets_with_location(df)

    # Progress output: the row count of each column (always three equal numbers).
    for column in ('user', 'tweet', 'location'):
        print(len(tweets[column]))

    #lowercase
    tweets['tweet'] = tweets['tweet'].astype(str).str.lower()
    print("lowercase")

    # Set lookup instead of scanning the word list once per token. Building
    # the set per file is negligible next to the per-token scans it replaces.
    vocabulary = frozenset(common_words)
    tweets['tweet'] = tweets['tweet'].apply(
        lambda text: keep_known_words(text, vocabulary)
    )

    document = tweets['tweet'].tolist()
    index = tweets['user'].tolist()

    # NOTE: despite the "tfidf" names (kept so existing output files keep
    # their names), the similarity is computed from raw term counts.
    tfidf_matrix = pairwise_similarity_table(document, index)

    new_fname = similarity_csv_name(fname)
    tfidf_matrix.to_csv(new_fname)