Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
binary-classification-of-tweets-about-protest/utils.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
102 lines (91 sloc)
4.14 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import read_csv | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score, classification_report, recall_score, precision_score, accuracy_score, f1_score | |
from sklearn.linear_model import LogisticRegression | |
from imblearn.over_sampling import RandomOverSampler | |
from imblearn.under_sampling import RandomUnderSampler | |
from termcolor import colored | |
from collections import Counter | |
import pyinputplus as pyip | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
import numpy | |
from sklearn.preprocessing import MinMaxScaler | |
def get_user_menu_input(options, prompt):
    """Show a lettered menu and return the user's selection as a list.

    An extra "All" entry is placed ahead of ``options`` in the menu.

    Args:
        options: List of menu entries to offer.
        prompt: Text shown above the menu (rendered in blue, followed by a
            newline).

    Returns:
        ``options`` itself (the same list object) when "All" is picked,
        otherwise a one-element list holding the chosen entry.
    """
    menu_prompt = colored("{prompt}\n".format(prompt=prompt), "blue")
    choice = pyip.inputMenu(["All"] + options, lettered=True, prompt=menu_prompt)
    return options if choice == "All" else [choice]
def get_user_string_input(prompt):
    """Ask the user for a string and return the value from ``pyip.inputStr``.

    Args:
        prompt: Text shown to the user (rendered in blue, followed by a
            newline).
    """
    coloured_prompt = colored("{prompt}\n".format(prompt=prompt), "blue")
    return pyip.inputStr(coloured_prompt)
# Without topic, accuracy falls to 79% | |
# Without capitals, -1% | |
# Favorite, status,_count | |
def extract_features(df, gram, cluster):
    """Turn ``df`` into a [0, 1]-scaled matrix with a text-cluster column.

    The ``text`` column is vectorised with TF-IDF over n-grams of exactly
    ``gram`` tokens and clustered with KMeans; each row's cluster label is
    appended as the last column after all remaining columns of ``df``.
    Every column of the combined matrix (including the cluster label) is
    then min-max scaled.

    Args:
        df: DataFrame with a ``text`` column; all other columns are used
            as-is and must be numeric for the scaler to accept them.
        gram: n-gram size, used as both bounds of ``ngram_range``.
        cluster: Number of KMeans clusters.

    Returns:
        2-D array with one row per row of ``df``: the non-text columns in
        their original order, then the cluster label, all scaled to [0, 1].
    """
    # Everything except the raw text goes into the matrix unchanged.
    base_columns = df.drop(columns=['text']).values

    # astype('U') coerces every entry to a unicode string for the vectorizer.
    documents = df['text'].values.astype('U')
    vectorizer = TfidfVectorizer(ngram_range=(gram, gram))
    tfidf_matrix = vectorizer.fit_transform(documents)

    clusterer = KMeans(
        init="k-means++",
        n_clusters=cluster,
        n_init=30,
        max_iter=500,
        random_state=42,
    )
    clusterer.fit(tfidf_matrix)

    # Wrap the 1-D labels as a single row, then transpose into a column.
    cluster_column = numpy.array([clusterer.labels_]).T

    combined = numpy.concatenate((base_columns, cluster_column), axis=1)
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(combined)
def _load_feature_matrix(path, gram, cluster):
    """Read one preprocessed CSV, drop unused columns, and extract features."""
    frame = read_csv(path)
    frame = frame.drop(columns=['readability', 'sentiment'])
    return extract_features(frame, gram, cluster)


def get_data(option, gram, cluster):
    """Load preprocessed tweets and return a train/test split.

    In every mode the first column of the extracted feature matrix is used
    as the target and the remaining columns as the features.

    Args:
        option: Which data to use.
            0 -- train on ``may-3.csv`` and test on ``april-21.csv``.
            1 -- random 67/33 split of ``april-21.csv``.
            2 -- random 67/33 split of ``may-3.csv``.
        gram: n-gram size forwarded to ``extract_features``.
        cluster: Number of clusters forwarded to ``extract_features``.

    Returns:
        ``X_train, X_test, Y_train, Y_test`` (a tuple for option 0, the
        list returned by ``train_test_split`` otherwise).

    Raises:
        ValueError: If ``option`` is not 0, 1 or 2.
    """
    april_path = 'data/preprocessed/april-21.csv'
    may_path = 'data/preprocessed/may-3.csv'

    if option == 0:
        # NOTE(review): the earlier version of this code loaded may-3.csv
        # into a variable named "april" (used for training) and
        # april-21.csv into one named "may" (used for testing). The actual
        # file-to-role mapping is kept unchanged here; confirm that training
        # on May and testing on April is what is intended.
        # NOTE(review): each call to extract_features fits its own TF-IDF,
        # KMeans and scaler, so cluster ids and scaling are not shared
        # between the train and test matrices.
        train_matrix = _load_feature_matrix(may_path, gram, cluster)
        test_matrix = _load_feature_matrix(april_path, gram, cluster)
        X_train, Y_train = train_matrix[:, 1:], train_matrix[:, 0]
        X_test, Y_test = test_matrix[:, 1:], test_matrix[:, 0]
        return X_train, X_test, Y_train, Y_test

    if option == 1:
        source_path = april_path
    elif option == 2:
        source_path = may_path
    else:
        # Fail before any I/O instead of hitting an unbound local later on.
        raise ValueError(
            "option must be 0, 1 or 2, got {option!r}".format(option=option)
        )

    dataset = _load_feature_matrix(source_path, gram, cluster)
    X = dataset[:, 1:]
    Y = dataset[:, 0]
    seed = 7
    test_size = 0.33
    return train_test_split(X, Y, test_size=test_size, random_state=seed)
def get_model_spec(model, X_train, X_test, Y_train, Y_test, random_state=None):
    """Fit ``model`` on oversampled training data and score it on the test set.

    The training data is resampled with ``RandomOverSampler`` using
    ``sampling_strategy='minority'`` before fitting; the test data is left
    untouched.

    Args:
        model: Zero-argument callable (e.g. an estimator class) returning an
            object with ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: Test labels.
        random_state: Optional seed forwarded to ``RandomOverSampler`` so
            the resampling can be reproduced. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        ``(accuracy, recall, precision, f1, fitted_model)``; recall,
        precision and F1 use ``average='weighted'``.
    """
    best_model = model()

    oversample = RandomOverSampler(
        sampling_strategy='minority',
        random_state=random_state,
    )
    X_train, Y_train = oversample.fit_resample(X_train, Y_train)

    best_model.fit(X_train, Y_train)
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(Y_test, y_pred)

    # Diagnostic output: predicted class distribution and a length check.
    print(Counter(y_pred))
    print(len(y_pred))
    print(len(Y_test))

    recall = recall_score(Y_test, y_pred, average='weighted')
    print(recall)
    precision = precision_score(Y_test, y_pred, average='weighted')
    f1 = f1_score(Y_test, y_pred, average='weighted')
    return accuracy, recall, precision, f1, best_model
def get_labelled_data():
    """Copy the labelled columns of the raw protest CSV into ``labelled.csv``.

    Reads ``data/raw/protest.csv`` (ISO-8859-1 encoded), keeps only the
    columns listed below, and writes them to ``data/raw/labelled.csv``
    without an index column.

    Returns:
        None. The result is the file written to disk.
    """
    columns = ["text", "O/NO", "S/NS", "Y/N", "description"]
    data = read_csv(
        'data/raw/protest.csv',
        encoding='ISO-8859-1',
        usecols=columns,
    )
    # ``usecols`` keeps the file's own column order, so select explicitly;
    # otherwise the header written below could be paired with the wrong data
    # whenever the source file orders its columns differently.
    data = data[columns]
    data.to_csv(
        'data/raw/labelled.csv',
        header=columns,
        index=False,
    )