Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from termcolor import colored
from collections import Counter
import pyinputplus as pyip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy
from sklearn.preprocessing import MinMaxScaler
def get_user_menu_input(options, prompt):
    """Show a lettered menu and return the selected option(s) as a list.

    An extra "All" entry is placed ahead of ``options``. Choosing it returns
    ``options`` itself; choosing anything else returns a one-element list
    holding that choice. The prompt is shown in blue.
    """
    menu_prompt = colored("{}\n".format(prompt), "blue")
    choice = pyip.inputMenu(["All"] + options, lettered=True, prompt=menu_prompt)
    return options if choice == "All" else [choice]
def get_user_string_input(prompt):
    """Ask the user for a string, showing ``prompt`` in blue, and return it."""
    blue_prompt = colored("{}\n".format(prompt), "blue")
    return pyip.inputStr(blue_prompt)
# Without topic, accuracy falls to 79%
# Without capitals, -1%
# Favorite, status,_count
def extract_features(df, gram, cluster):
    """Build a min-max-scaled feature matrix with a text-cluster column.

    The ``text`` column is TF-IDF vectorised using only ``gram``-grams and
    grouped into ``cluster`` K-means clusters. Each row's cluster label is
    appended as the last column, after all the non-text columns of ``df``,
    and every column of the result is rescaled to the range [0, 1].

    Returns the rescaled array.
    """
    other_columns = df.drop(columns=['text']).values

    texts = df['text'].values.astype('U')
    tfidf_matrix = TfidfVectorizer(ngram_range=(gram, gram)).fit_transform(texts)

    clusterer = KMeans(
        init="k-means++",
        n_clusters=cluster,
        n_init=30,
        max_iter=500,
        random_state=42,
    )
    clusterer.fit(tfidf_matrix)
    # One label per row, shaped as a single column so it can be appended.
    cluster_column = clusterer.labels_.reshape(-1, 1)

    combined = numpy.concatenate((other_columns, cluster_column), axis=1)
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(combined)
_APRIL_CSV = 'data/preprocessed/april-21.csv'
_MAY_CSV = 'data/preprocessed/may-3.csv'


def _load_preprocessed(path):
    """Read a preprocessed CSV and discard its readability/sentiment columns."""
    return read_csv(path).drop(columns=['readability', 'sentiment'])


def get_data(option, gram, cluster):
    """Load preprocessed data and return a train/test split.

    Args:
        option: Which data to use.
            0 -- fit on one file and evaluate on the other (see the note in
                 the body about which file plays which role);
            1 -- random 67/33 split of ``april-21.csv``;
            2 -- random 67/33 split of ``may-3.csv``.
        gram: n-gram size forwarded to ``extract_features``.
        cluster: number of clusters forwarded to ``extract_features``.

    Returns:
        ``X_train, X_test, Y_train, Y_test``. Column 0 of the matrix
        produced by ``extract_features`` is taken as the target and the
        remaining columns as the features.

    Raises:
        ValueError: if ``option`` is not 0, 1 or 2.
    """
    if option == 0:
        # NOTE(review): the previous code bound april-21.csv to a variable
        # named ``df_may`` (used for testing) and may-3.csv to ``df_april``
        # (used for training). The file-to-role mapping is kept exactly as
        # it was -- train on may-3.csv, test on april-21.csv -- but the
        # names suggest the opposite may have been intended; confirm.
        test_frame = _load_preprocessed(_APRIL_CSV)
        train_frame = _load_preprocessed(_MAY_CSV)

        # Features are extracted independently for each file.
        train_matrix = extract_features(train_frame, gram, cluster)
        test_matrix = extract_features(test_frame, gram, cluster)
        return (
            train_matrix[:, 1:],
            test_matrix[:, 1:],
            train_matrix[:, 0],
            test_matrix[:, 0],
        )

    if option == 1:
        path = _APRIL_CSV
    elif option == 2:
        path = _MAY_CSV
    else:
        # Fail early and clearly instead of hitting an unbound local later.
        raise ValueError(
            "option must be 0, 1 or 2, got {!r}".format(option)
        )

    dataset = extract_features(_load_preprocessed(path), gram, cluster)
    features = dataset[:, 1:]
    target = dataset[:, 0]
    return train_test_split(features, target, test_size=0.33, random_state=7)
def get_model_spec(model, X_train, X_test, Y_train, Y_test):
    """Fit ``model()`` on oversampled training data and score it on the test set.

    The minority class of the training data is randomly oversampled before
    fitting. Diagnostics are printed along the way: the count of each
    predicted value, the number of predictions, the number of test labels,
    and the weighted recall.

    Returns ``(accuracy, recall, precision, f1, fitted_model)``; recall,
    precision and F1 all use weighted averaging.
    """
    classifier = model()

    sampler = RandomOverSampler(sampling_strategy='minority')
    X_balanced, Y_balanced = sampler.fit_resample(X_train, Y_train)

    classifier.fit(X_balanced, Y_balanced)
    predictions = classifier.predict(X_test)

    accuracy = accuracy_score(Y_test, predictions)

    print(Counter(predictions))
    print(len(predictions))
    print(len(Y_test))

    weighted = {'average': 'weighted'}
    recall = recall_score(Y_test, predictions, **weighted)
    print(recall)
    precision = precision_score(Y_test, predictions, **weighted)
    f1 = f1_score(Y_test, predictions, **weighted)

    return accuracy, recall, precision, f1, classifier
def get_labelled_data():
    """Copy the labelled columns of the raw protest CSV into ``labelled.csv``.

    Reads ``data/raw/protest.csv`` (ISO-8859-1 encoded), keeps only the
    text, label and description columns, and writes them without the row
    index to ``data/raw/labelled.csv``. Columns are written in the order
    listed below, each under its own name.
    """
    columns = ["text", "O/NO", "S/NS", "Y/N", "description"]
    data = read_csv(
        'data/raw/protest.csv',
        encoding='ISO-8859-1',
        usecols=columns,
    )
    # ``usecols`` ignores the order of its elements, so the frame keeps the
    # file's own column order. Passing the list as ``header`` would merely
    # relabel the columns positionally; selecting by name with ``columns``
    # keeps every header above its own data.
    data.to_csv('data/raw/labelled.csv', columns=columns, index=False)