import string
import re
import glob
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the concreteness ratings file
word_df = pd.read_csv(r"Concreteness_ratings_Brysbaert_et_al_BRM.csv", encoding="utf-8")
# A set gives O(1) membership checks when filtering tweet tokens below
common_words = set(word_df["Word"].tolist())
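# Illustrative sanity check (optional, not required by the pipeline): confirm
# the word list loaded before filtering tweets against it.
print("Loaded", len(common_words), "words from the concreteness ratings file")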
#Change path to the location on your own computer
path = "/Users/qixia/Git/Charles_ContextEffectsOnAbstract_tweets/compilations/*.csv"
# Length of the directory prefix: used below to strip the folder and the ".csv"
# extension from each matched filename (len("*.csv") == 5)
path_len = len(path) - 5
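# Illustrative guard (optional): warn early if the glob pattern matches no
# files, e.g. because the path above was not changed.
if not glob.glob(path):
    print("No CSV files found at", path)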
for fname in glob.glob(path):
    df = pd.read_csv(fname, encoding="utf-8")
    # Keep only tweets that have a location, then collect text, user, and location
    df = df.dropna(subset=['location'])
    df = df.reset_index(drop=True)
    md = {'tweet': [], 'user': [], 'location': []}
    for i in range(len(df.index)):
        md['tweet'].append(str(df['text'][i]))
        md['user'].append(str(df['screen_name'][i]))
        md['location'].append(str(df['location'][i]))
    print(len(md['user']))
    print(len(md['tweet']))
    print(len(md['location']))
    tweets = pd.DataFrame(md)
    # Lowercase the tweet text
    tweets['tweet'] = tweets['tweet'].astype(str).str.lower()
    print("lowercase")
    # Keep a token only if it appears in the concreteness word list
    def extractwords(word):
        if word in common_words:
            return word
        else:
            return ""

    def tokenize(x):
        return x.split()

    # Drop out-of-vocabulary tokens from each tweet ...
    tweets['tweet'] = tweets['tweet'].apply(lambda x: ' '.join([extractwords(w) for w in tokenize(x)]))
    # ... then collapse the extra whitespace that leaves behind
    tweets['tweet'] = tweets['tweet'].apply(lambda x: " ".join(str(x).split()))
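    # Illustrative peek (optional): a few filtered tweets, to confirm that only
    # words from the concreteness list survive the filtering above.
    print(tweets['tweet'].head())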
    # Build one document per tweet, with the tweeting user as its index label
    document = []
    index = []
    for i in range(len(tweets.index)):
        temp = " ".join(str(tweets["tweet"][i]).split())
        document.append(temp)
        index.append(tweets["user"][i])
    # Scikit-learn: build the document-term matrix
    from sklearn.feature_extraction.text import CountVectorizer

    # Raw term counts; pass stop_words='english' to CountVectorizer to also drop
    # English stop words
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(document)

    # OPTIONAL: convert the sparse matrix to a pandas DataFrame to inspect the word frequencies
    doc_term_matrix = sparse_matrix.todense()
    freq_df = pd.DataFrame(doc_term_matrix,
                           columns=count_vectorizer.get_feature_names_out(),  # get_feature_names() on scikit-learn < 1.0
                           index=index)
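    # Illustrative peek (optional): the ten most frequent retained words in this
    # file, summed down the columns of the document-term matrix.
    print(freq_df.sum(axis=0).sort_values(ascending=False).head(10))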
    # Pairwise cosine similarity between the tweets' word-count vectors
    # (computed on raw counts here, despite the tf-idf naming)
    from sklearn.metrics.pairwise import cosine_similarity
    tfidf_similarity = cosine_similarity(freq_df, freq_df)
    tfidf_matrix = pd.DataFrame(tfidf_similarity)
    tfidf_matrix.columns = index
    tfidf_matrix.index = index
    # Save one similarity matrix per input file, named after that file's base name
    new_fname = "".join([fname[path_len:-4], "_tfidf_matrix.csv"])
    tfidf_matrix.to_csv(new_fname)
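    # Illustrative usage (optional): reload the matrix just written and look up
    # the similarity between the first two tweets' users in this file.
    saved = pd.read_csv(new_fname, index_col=0)
    if len(saved.index) > 1:
        print(saved.index[0], "vs", saved.index[1], "->", saved.iloc[0, 1])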