Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
binary-classification-of-tweets-about-protest/preprocess.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
148 lines (126 sloc)
4.81 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
import nltk | |
import csv | |
from spellchecker import SpellChecker | |
import pandas | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
import textstat | |
# Highest-frequency tokens in the labelled April and May tweet sets.  They
# appear in both classes and carry little signal, so the preprocessing
# pipeline strips them from every tweet.
april_common_words = ['today', 'say', 'stay', 'take', 'like', 'amp', 'protester', 'gridlock', 'think', 'many', 'michigan', 'make', 'live', 'operationgridlock', 'need', 'people', 'get', 'protest', 'right', 'governor', 'state', 'would', 'order', 'one', 'home', 'trump', 'want', 'see', 'work', 'go', 'u', 'mi', 'lansing']
may_common_words = ['michiganprotest', 'like', 'people', 'u', 'whitmer', 'state', 'protester', 'michiganterrorists', 'go', 'arm', 'protest', 'gun', 'american', 'get', 'governor', 'amp', 'need', 'right', 'freedom', 'michigan', 'call', 'covid', 'government', 'patriot']
common_words = april_common_words + may_common_words

# One-time NLTK data download; uncomment on a fresh machine.
# nltk.download('wordnet')

# Columns read from the raw CSV export ('Y/N' is the hand label).
cols = [
    'text',
    'Y/N',
    'favorite_count',
    'retweet_count',
    'statuses_count',
    'friends_count',
]

# Shared NLP helpers used by the token-level preprocessing functions below.
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()
stop_words = set(stopwords.words('english'))
def remove_link(txt):
    """Strip every http(s) URL from the tweet text."""
    return re.sub(r'https?://\S+', '', txt)
def remove_markup(text):
    """Delete anything wrapped in angle brackets (leftover HTML/markup tags).

    The pattern matches a run of '<', any non-'>' characters, then a run
    of '>', and removes the whole thing.
    """
    markup = re.compile(r'(<+)([^>]*)(>+)')
    return markup.sub('', text)
def normalize_support_column(column):
    """Encode the hand label as a binary target: 'Y' -> 1, anything else -> 0."""
    if column == 'Y':
        return 1
    return 0
def remove_spaces(text):
    """Collapse every run of whitespace to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def remove_nonletters(text):
    """Replace each run of non-letter characters with a single space.

    Apostrophes are kept so contractions like "don't" survive intact.
    Note: Should test including ! r'[^a-zA-Z!?]+'
    """
    return re.sub(r'[^a-zA-Z\']+', ' ', text)
def remove_common_words(text):
    """Drop tokens that appear in the combined April+May high-frequency list.

    Bug fix: the original filtered against ``april_common_words`` only,
    leaving the module-level ``common_words`` (april + may) defined but
    unused — so May-specific frequent tokens were never removed even
    though this function is part of the shared pipeline.
    """
    return [word for word in text if word not in common_words]
def acryonym(words):
    """Expand contractions and internet abbreviations token by token.

    A token found in the lookup table is replaced by its spelled-out form
    (which may itself contain spaces); every other token passes through
    unchanged.
    """
    expansions = {
        "hasn't": "has not",
        "weren't": "were not",
        "wasn't": "was not",
        "don't": "do not",
        "won't": "will not",
        "'ve": 'have',
        "'ll": 'will',
        'tf': 'the fuck',
        'mfks': 'mother fuckers',
        'wh': 'white house',
        'lol': 'laugh out loud',
        "'m": "am",
    }
    return [expansions.get(word, word) for word in words]
def autocorrect(words):
    """Spell-correct every token, returning the corrected token list.

    Tokens are first re-joined and re-split so multi-word expansions from
    acryonym() (e.g. "do not") become individual tokens.  Only tokens the
    spell checker flags as unknown are looked up, since correction() is
    expensive.
    """
    tokens = " ".join(words).split(" ")
    corrections = {w: spell.correction(w) for w in spell.unknown(tokens)}
    return [corrections.get(token, token) for token in tokens]
def remove_tags(text):
    """Remove @mentions from the tweet text, keeping the preceding whitespace.

    Bug fix: the original pattern ``(\\s)@\\w+`` required a whitespace
    character before the '@', so a mention at the very start of the tweet
    was never removed.  Anchoring with ``(^|\\s)`` handles that case while
    still leaving embedded '@' (e.g. e-mail-like strings) untouched.
    """
    return re.sub(r'(^|\s)@\w+', r'\1', text)
def remove_stop(words):
    """Filter out English stopwords (NLTK's list, module-level set)."""
    return [word for word in words if word not in stop_words]
def lemmatize(words):
    """Lemmatize each token twice: first as a verb, then as a noun."""
    result = []
    for word in words:
        as_verb = lemmatizer.lemmatize(word, 'v')
        result.append(lemmatizer.lemmatize(as_verb, 'n'))
    return result
def preprocess(text):
    """Run the full cleaning pipeline on one tweet.

    Lower-cases the text, applies the string-level passes (links, mentions,
    markup, punctuation, whitespace), then the token-level passes
    (abbreviation expansion, lemmatization, common-word and stopword
    removal), and returns the surviving tokens joined by single spaces.
    """
    sentence_steps = (
        remove_link,
        remove_tags,
        remove_markup,
        remove_nonletters,
        remove_spaces,
    )
    # autocorrect is intentionally left out of the word-level passes.
    word_steps = (
        acryonym,
        lemmatize,
        remove_common_words,
        remove_stop,
    )
    text = text.lower()
    for step in sentence_steps:
        text = step(text)
    words = text.split(" ")
    for step in word_steps:
        words = step(words)
    return " ".join(words)
def get_labelled_dataframe():
    # NOTE(review): `labelled_df` is not defined anywhere in this module, so
    # calling this function raises NameError.  Presumably a module-level
    # dataframe was removed or renamed at some point — confirm against
    # callers before relying on this accessor.  TODO confirm
    return labelled_df
if __name__ == "__main__":
    # Build the preprocessed/featurized April dataset from the raw export.
    df = pandas.read_csv('data/raw/april-21.csv', encoding='ISO-8859-1', usecols=cols)
    print("Length before:", len(df.text))

    # Keep only rows that were actually hand-labelled.  Bug fix: reset the
    # index afterwards — the original kept the filtered (gappy) index while
    # the loop below iterated range(cols_num), so `df.text[i]` raised
    # KeyError for every dropped position and a bare `except:` silently
    # skipped those rows without preprocessing them.
    df = df.loc[(df['Y/N'] == 'Y') | (df['Y/N'] == 'N')].reset_index(drop=True)
    cols_num = len(df.text)
    print("Length after:", cols_num)

    analyser = SentimentIntensityAnalyzer()
    df['Y/N'] = df['Y/N'].apply(normalize_support_column)

    # Feature columns computed from the *raw* text, before preprocessing.
    # Bug fix: 'hashtag_count' previously counted whitespace-split tokens;
    # count '#' characters instead, as the column name states.
    df['hashtag_count'] = df['text'].apply(lambda x: x.count('#') if isinstance(x, str) else 0)
    df['capitals'] = df['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['exclamation_marks'] = df['text'].apply(lambda x: x.count('!'))
    df['sentiment'] = df['text'].apply(lambda x: analyser.polarity_scores(x)['compound'])
    df['readability'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))

    for i in range(cols_num):
        if (i + 1) % 100 == 0:
            print(i + 1)  # progress indicator, one line per 100 rows
        try:
            # .loc assignment avoids the chained-assignment (SettingWithCopy)
            # pattern of `df.text[i] = ...` in the original.
            df.loc[i, 'text'] = preprocess(df.loc[i, 'text'])
        except Exception:
            # Narrowed from a bare `except:`; report the row and keep going.
            print("Error processing at ", i + 1)

    df.to_csv('data/preprocessed/april-21.csv', index=None)