Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from pandas import read_csv
import config
import textstat
import profanity_check
def get_cluster(df, cluster):
    """Cluster the rows of ``df`` by the TF-IDF vectors of their text.

    Args:
        df: DataFrame with a ``text`` column; the values are cast to
            unicode strings before being vectorized.
        cluster: Number of clusters, passed to KMeans as ``n_clusters``.

    Returns:
        A numpy array built from a one-element list holding the fitted
        KMeans ``labels_`` (i.e. the labels wrapped in one extra outer
        dimension).
    """
    documents = df['text'].values.astype('U')

    # Unigram-only TF-IDF representation of every document.
    doc_term_matrix = TfidfVectorizer(ngram_range=(1, 1)).fit_transform(documents)

    # Fixed random_state keeps the clustering reproducible between runs.
    model = KMeans(
        n_clusters=cluster,
        init="k-means++",
        n_init=30,
        max_iter=500,
        random_state=42,
    )
    model.fit(doc_term_matrix)

    return numpy.array([model.labels_])
def extract_features(df, Y):
    """Build a min-max scaled feature matrix from ``df`` plus extra columns.

    Args:
        df: DataFrame containing a ``text`` column, which is dropped; all
            remaining columns are used as-is.
        Y: Array whose transpose is appended column-wise to the remaining
            columns of ``df``.  Presumably shaped (1, n_rows), as returned
            by ``get_cluster`` -- confirm against callers.

    Returns:
        The combined matrix with every column rescaled to the range [0, 1]
        by a MinMaxScaler fitted on that same matrix.
    """
    non_text_values = df.drop(columns=['text']).values
    # Y.T turns the row(s) of Y into column(s) so they line up with the rows.
    combined = numpy.concatenate((non_text_values, Y.T), axis=1)
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(combined)
def get_model_accuracy(X_train, X_test, Y_train, Y_test):
    """Train an XGBoost classifier and score it on the held-out split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        Y_train: Training labels.
        Y_test: Test labels.

    Returns:
        The ``accuracy_score`` of the model's predictions on ``X_test``
        measured against ``Y_test``.
    """
    # Fixed hyper-parameters used for every evaluation.
    hyperparams = {
        "min_child_weight": 0.002,
        "gamma": 0.001,
        "subsample": 1.0,
        "colsample_bytree": 0.01,
        "max_depth": 35,
        "eta": 0.3,
    }
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(X_train, Y_train)

    predictions = classifier.predict(X_test)
    return accuracy_score(Y_test, predictions)
# Candidate single-text feature extractors, keyed by a descriptive name.
# Each callable takes one text string and returns the first (only) element
# of the corresponding profanity_check result for that string.
# The names now match the function they call: "profanity_prob" uses
# predict_prob and "profanity_predict" uses predict (they were crossed).
possible_methods = {
    "profanity_prob": lambda x: profanity_check.predict_prob([x])[0],
    "profanity_predict": lambda x: profanity_check.predict([x])[0],
}
def funcHandler(func, test_data):
    """Call ``func(test_data)``, falling back to 0 if the call fails.

    Args:
        func: Callable taking a single argument.
        test_data: Value passed to ``func``.

    Returns:
        Whatever ``func`` returns, or ``0`` when ``func`` raises an
        ``Exception``.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate and a long run can be interrupted.
    """
    try:
        return func(test_data)
    except Exception:
        # Deliberate best-effort: a failing extractor yields a neutral 0.
        return 0
# Driver script: for every cluster count in config.clusters and every
# candidate extractor in possible_methods, add the extractor's output as a
# 'test_feature' column, train a model, and record the extractors whose
# accuracy exceeds cur_accuracy.
df = read_csv('data/preprocessed/april-21.csv')
# These two columns are excluded from the experiment.
df.drop(columns=['readability', 'sentiment'], inplace=True)

# Accuracy a candidate must exceed to be recorded as useful
# (presumably the score of the current feature set -- confirm).
cur_accuracy = 0.81117
useful = set()

# Fixed split parameters so every candidate sees the same train/test split.
seed = 7
test_size = 0.33

for k in config.clusters:
    cluster = get_cluster(df, k)
    for method, func in possible_methods.items():
        print("Method: ", method)
        # funcHandler turns any extractor failure into a 0 for that row.
        df['test_feature'] = df['text'].apply(lambda data: funcHandler(func, data))
        print(df['test_feature'])

        dataset = extract_features(df, cluster)
        # Column 0 is used as the target, the remaining columns as inputs.
        X = dataset[:, 1:]
        Y = dataset[:, 0]
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=seed
        )

        accuracy = get_model_accuracy(X_train, X_test, Y_train, Y_test)
        print("Accuracy :", accuracy)
        if accuracy > cur_accuracy:
            # Record the extractor's name; previously the constant
            # threshold itself was added, which carried no information.
            useful.add(method)

print("Useful features", useful)