Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import string
import re
from nltk.stem import PorterStemmer
import nltk
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
# Build a user-by-user cosine-similarity matrix from the tweets in
# clean_file_3.csv and write it to tfidf_matrix.csv.

# Read the input file (decoded as latin-1).
df = pd.read_csv(r"clean_file_3.csv", encoding="latin-1")

# One whitespace-normalised document per row; runs of whitespace collapse to a
# single space. str() is applied to every cell, so a non-string value is
# vectorised as its string form (e.g. a float NaN becomes the token "nan").
document = [" ".join(str(tweet).split()) for tweet in df["tweet"]]
# Row/column labels for the output matrices: the "user" value of each row.
index = list(df["user"])

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create the Document Term Matrix.
# NOTE: the script used to build CountVectorizer(stop_words='english') and then
# immediately overwrite it with the default vectorizer, so stop words were
# never removed. That effective behaviour is kept; pass stop_words="english"
# here if filtering is actually wanted.
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(document)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# OPTIONAL: dense word-frequency table, kept for inspection only. toarray()
# yields a plain ndarray rather than the legacy np.matrix that todense() returns.
doc_term_matrix = sparse_matrix.toarray()
friq_df = pd.DataFrame(doc_term_matrix, columns=feature_names, index=index)

# Pairwise cosine similarity between documents, computed directly on the
# sparse count matrix (same values as on the dense table, without the dense
# intermediate). Despite the "tfidf" names, the inputs are raw term counts;
# the names and the output file name are kept for compatibility.
tfidf_similarity = cosine_similarity(sparse_matrix)
tfidf_matrix = pd.DataFrame(tfidf_similarity, index=index, columns=index)
tfidf_matrix.to_csv("tfidf_matrix.csv")