Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
binary-classification-of-tweets-about-protest/preprocess.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
148 lines (126 sloc)
4.81 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
import nltk | |
import csv | |
from spellchecker import SpellChecker | |
import pandas | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
import textstat | |
# Highest-frequency tokens in the labelled April and May tweet sets.  They
# appear in both classes and carry little signal, so the preprocessing
# pipeline strips them from every tweet.
april_common_words = ['today', 'say', 'stay', 'take', 'like', 'amp', 'protester', 'gridlock', 'think', 'many', 'michigan', 'make', 'live', 'operationgridlock', 'need', 'people', 'get', 'protest', 'right', 'governor', 'state', 'would', 'order', 'one', 'home', 'trump', 'want', 'see', 'work', 'go', 'u', 'mi', 'lansing']
may_common_words = ['michiganprotest', 'like', 'people', 'u', 'whitmer', 'state', 'protester', 'michiganterrorists', 'go', 'arm', 'protest', 'gun', 'american', 'get', 'governor', 'amp', 'need', 'right', 'freedom', 'michigan', 'call', 'covid', 'government', 'patriot']
common_words = april_common_words + may_common_words

# One-time NLTK data download; uncomment on a fresh machine.
# nltk.download('wordnet')

# Columns read from the raw CSV export ('Y/N' is the hand label).
cols = [
    'text',
    'Y/N',
    'favorite_count',
    'retweet_count',
    'statuses_count',
    'friends_count',
]

# Shared NLP helpers used by the token-level preprocessing functions below.
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()
stop_words = set(stopwords.words('english'))
def remove_link(txt):
    """Strip every http(s) URL from the tweet text."""
    return re.sub(r'https?://\S+', '', txt)
def remove_markup(text):
    """Delete anything wrapped in angle brackets (leftover HTML/markup tags).

    The pattern matches a run of '<', any non-'>' characters, then a run
    of '>', and removes the whole thing.
    """
    markup = re.compile(r'(<+)([^>]*)(>+)')
    return markup.sub('', text)
def normalize_support_column(column):
    """Encode the hand label as a binary target: 'Y' -> 1, anything else -> 0."""
    if column == 'Y':
        return 1
    return 0
def remove_spaces(text):
    """Collapse every run of whitespace to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def remove_nonletters(text):
    """Replace each run of non-letter characters with a single space.

    Apostrophes are kept so contractions like "don't" survive intact.
    Note: Should test including ! r'[^a-zA-Z!?]+'
    """
    return re.sub(r'[^a-zA-Z\']+', ' ', text)
def remove_common_words(text):
    """Drop tokens that appear in the combined April+May high-frequency list.

    Bug fix: the original filtered against ``april_common_words`` only,
    leaving the module-level ``common_words`` (april + may) defined but
    unused — so May-specific frequent tokens were never removed even
    though this function is part of the shared pipeline.
    """
    return [word for word in text if word not in common_words]
def acryonym(words):
    """Expand contractions and internet abbreviations token by token.

    A token found in the lookup table is replaced by its spelled-out form
    (which may itself contain spaces); every other token passes through
    unchanged.
    """
    expansions = {
        "hasn't": "has not",
        "weren't": "were not",
        "wasn't": "was not",
        "don't": "do not",
        "won't": "will not",
        "'ve": 'have',
        "'ll": 'will',
        'tf': 'the fuck',
        'mfks': 'mother fuckers',
        'wh': 'white house',
        'lol': 'laugh out loud',
        "'m": "am",
    }
    return [expansions.get(word, word) for word in words]
def autocorrect(words):
    """Spell-correct every token, returning the corrected token list.

    Tokens are first re-joined and re-split so multi-word expansions from
    acryonym() (e.g. "do not") become individual tokens.  Only tokens the
    spell checker flags as unknown are looked up, since correction() is
    expensive.
    """
    tokens = " ".join(words).split(" ")
    corrections = {w: spell.correction(w) for w in spell.unknown(tokens)}
    return [corrections.get(token, token) for token in tokens]
def remove_tags(text):
    """Remove @mentions from the tweet text, keeping the preceding whitespace.

    Bug fix: the original pattern ``(\\s)@\\w+`` required a whitespace
    character before the '@', so a mention at the very start of the tweet
    was never removed.  Anchoring with ``(^|\\s)`` handles that case while
    still leaving embedded '@' (e.g. e-mail-like strings) untouched.
    """
    return re.sub(r'(^|\s)@\w+', r'\1', text)
def remove_stop(words):
    """Filter out English stopwords (NLTK's list, module-level set)."""
    return [word for word in words if word not in stop_words]
def lemmatize(words):
    """Lemmatize each token twice: first as a verb, then as a noun."""
    result = []
    for word in words:
        as_verb = lemmatizer.lemmatize(word, 'v')
        result.append(lemmatizer.lemmatize(as_verb, 'n'))
    return result
def preprocess(text):
    """Run the full cleaning pipeline on one tweet.

    Lower-cases the text, applies the string-level passes (links, mentions,
    markup, punctuation, whitespace), then the token-level passes
    (abbreviation expansion, lemmatization, common-word and stopword
    removal), and returns the surviving tokens joined by single spaces.
    """
    sentence_steps = (
        remove_link,
        remove_tags,
        remove_markup,
        remove_nonletters,
        remove_spaces,
    )
    # autocorrect is intentionally left out of the word-level passes.
    word_steps = (
        acryonym,
        lemmatize,
        remove_common_words,
        remove_stop,
    )
    text = text.lower()
    for step in sentence_steps:
        text = step(text)
    words = text.split(" ")
    for step in word_steps:
        words = step(words)
    return " ".join(words)
def get_labelled_dataframe():
    # NOTE(review): `labelled_df` is not defined anywhere in this module, so
    # calling this function raises NameError.  Presumably a module-level
    # dataframe was removed or renamed at some point — confirm against
    # callers before relying on this accessor.  TODO confirm
    return labelled_df
if __name__ == "__main__":
    # Build the preprocessed/featurized April dataset from the raw export.
    df = pandas.read_csv('data/raw/april-21.csv', encoding='ISO-8859-1', usecols=cols)
    print("Length before:", len(df.text))

    # Keep only rows that were actually hand-labelled.  Bug fix: reset the
    # index afterwards — the original kept the filtered (gappy) index while
    # the loop below iterated range(cols_num), so `df.text[i]` raised
    # KeyError for every dropped position and a bare `except:` silently
    # skipped those rows without preprocessing them.
    df = df.loc[(df['Y/N'] == 'Y') | (df['Y/N'] == 'N')].reset_index(drop=True)
    cols_num = len(df.text)
    print("Length after:", cols_num)

    analyser = SentimentIntensityAnalyzer()
    df['Y/N'] = df['Y/N'].apply(normalize_support_column)

    # Feature columns computed from the *raw* text, before preprocessing.
    # Bug fix: 'hashtag_count' previously counted whitespace-split tokens;
    # count '#' characters instead, as the column name states.
    df['hashtag_count'] = df['text'].apply(lambda x: x.count('#') if isinstance(x, str) else 0)
    df['capitals'] = df['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['exclamation_marks'] = df['text'].apply(lambda x: x.count('!'))
    df['sentiment'] = df['text'].apply(lambda x: analyser.polarity_scores(x)['compound'])
    df['readability'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))

    for i in range(cols_num):
        if (i + 1) % 100 == 0:
            print(i + 1)  # progress indicator, one line per 100 rows
        try:
            # .loc assignment avoids the chained-assignment (SettingWithCopy)
            # pattern of `df.text[i] = ...` in the original.
            df.loc[i, 'text'] = preprocess(df.loc[i, 'text'])
        except Exception:
            # Narrowed from a bare `except:`; report the row and keep going.
            print("Error processing at ", i + 1)

    df.to_csv('data/preprocessed/april-21.csv', index=None)