Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import csv
from spellchecker import SpellChecker
import pandas
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat
# Most frequent tokens observed in each month's labelled tweets; they carry
# little stance signal and are filtered out during preprocessing.
april_common_words = [
    'today', 'say', 'stay', 'take', 'like', 'amp', 'protester', 'gridlock',
    'think', 'many', 'michigan', 'make', 'live', 'operationgridlock', 'need',
    'people', 'get', 'protest', 'right', 'governor', 'state', 'would',
    'order', 'one', 'home', 'trump', 'want', 'see', 'work', 'go', 'u', 'mi',
    'lansing',
]
may_common_words = [
    'michiganprotest', 'like', 'people', 'u', 'whitmer', 'state', 'protester',
    'michiganterrorists', 'go', 'arm', 'protest', 'gun', 'american', 'get',
    'governor', 'amp', 'need', 'right', 'freedom', 'michigan', 'call',
    'covid', 'government', 'patriot',
]
common_words = april_common_words + may_common_words

# nltk.download('wordnet')  # one-time NLTK resource fetch, if missing locally

# Columns read from the raw labelled CSV export.
cols = [
    'text',
    'Y/N',
    'favorite_count',
    'retweet_count',
    'statuses_count',
    'friends_count',
]

# Shared NLP helpers used by the word-level preprocessing passes.
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()
stop_words = set(stopwords.words('english'))
def remove_link(txt):
    """Delete http/https URLs (scheme through the next whitespace) from *txt*."""
    return re.sub(r'https?://\S+', '', txt)
def remove_markup(text):
    """Strip angle-bracket markup: a run of '<', any non-'>' chars, a run of '>'."""
    tag = re.compile(r'(<+)([^>]*)(>+)')
    return tag.sub('', text)
def normalize_support_column(column):
    """Map the 'Y/N' label to a binary int: 'Y' -> 1, anything else -> 0."""
    if column == 'Y':
        return 1
    return 0
# Compiled once at import time; the original recompiled this pattern on every
# call (re's internal cache softened that, but hoisting is the idiomatic form).
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def remove_spaces(text):
    """Collapse each run of whitespace in *text* to one space and strip the ends."""
    return _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
def remove_nonletters(text):
    """Replace every run of chars other than ASCII letters/apostrophes with one space."""
    # Upstream TODO kept: consider also preserving '!'/'?' (r'[^a-zA-Z!?]+').
    non_letters = re.compile(r'[^a-zA-Z\']+')
    return non_letters.sub(' ', text)
def remove_common_words(text):
    """Drop tokens found in the combined April+May high-frequency word list.

    Bug fix: the original filtered against only ``april_common_words`` even
    though ``common_words`` (April + May, built at module level) existed and
    was otherwise unused — May tweets kept their own high-frequency noise.
    """
    return [word for word in text if word not in common_words]
def acryonym(words):
    """Expand contractions and chat abbreviations, one token at a time.

    Tokens absent from the table pass through unchanged; matched tokens are
    replaced by their expansion as a single (possibly multi-word) element.
    """
    expansions = {
        "hasn't": "has not",
        "weren't": "were not",
        "wasn't": "was not",
        "don't": "do not",
        "won't": "will not",
        "'ve": 'have',
        "'ll": 'will',
        'tf': 'the fuck',
        'mfks': 'mother fuckers',
        'wh': 'white house',
        'lol': 'laugh out loud',
        "'m": "am",
    }
    return [expansions.get(word, word) for word in words]
def autocorrect(words):
    """Replace tokens SpellChecker flags as unknown with its top correction.

    The join/split first normalizes multi-word elements (e.g. expanded
    contractions) back into single tokens before spell checking.
    """
    tokens = " ".join(words).split(" ")
    corrections = {w: spell.correction(w) for w in spell.unknown(tokens)}
    return [corrections.get(token, token) for token in tokens]
def remove_tags(text):
    """Drop "@handle" mentions that follow whitespace; the whitespace survives.

    A mention at the very start of the string (no preceding whitespace) is kept.
    """
    mention = re.compile(r'(\s)@\w+')
    return mention.sub(r'\1', text)
def remove_stop(words):
    """Filter out tokens present in the NLTK English stopword set."""
    return [word for word in words if word not in stop_words]
def lemmatize(words):
    """Lemmatize each token twice: first as a verb, then the result as a noun."""
    result = []
    for word in words:
        as_verb = lemmatizer.lemmatize(word, 'v')
        result.append(lemmatizer.lemmatize(as_verb, 'n'))
    return result
def preprocess(text):
    """Run the full tweet-cleaning pipeline and return one cleaned string.

    Sentence-level passes operate on the raw lowercased string; word-level
    passes operate on the whitespace-split token list.

    Bug fix: ``acryonym`` expands a token into a multi-word string
    ("tf" -> "the fuck") that the original never re-tokenized, so later
    passes (lemmatize, common-word and stopword removal) saw it as one
    opaque token — e.g. "do not" escaped stopword removal.  ``autocorrect``
    already used the join/split idiom for exactly this reason; apply it
    after every word pass here.
    """
    sentence_passes = [
        remove_link,
        remove_tags,
        remove_markup,
        remove_nonletters,
        remove_spaces,
    ]
    word_passes = [
        acryonym,
        lemmatize,
        remove_common_words,
        remove_stop,
        # autocorrect,  # slow; enable when spell-correction is wanted
    ]
    text = text.lower()
    for sentence_pass in sentence_passes:
        text = sentence_pass(text)
    words = text.split(" ")
    for word_pass in word_passes:
        words = word_pass(words)
        # Re-split so multi-word elements become individual tokens again.
        words = " ".join(words).split(" ")
    return " ".join(words)
def get_labelled_dataframe():
    """Return the module-level ``labelled_df``.

    NOTE(review): ``labelled_df`` is never assigned anywhere in this file,
    so calling this raises NameError as written. Confirm whether another
    module injects the global or whether the ``__main__`` section was meant
    to publish its processed DataFrame under this name.
    """
    return labelled_df
if __name__ == "__main__":
    # Load the hand-labelled April tweets; the raw dump is Latin-1 encoded.
    df = pandas.read_csv('data/raw/april-21.csv', encoding='ISO-8859-1', usecols=cols)
    print("Length before:", len(df.text))

    # Keep only rows that were actually labelled Y or N.
    df = df.loc[(df['Y/N'] == 'Y') | (df['Y/N'] == 'N')].copy()
    print("Length after:", len(df.text))

    analyser = SentimentIntensityAnalyzer()
    df['Y/N'] = df['Y/N'].apply(normalize_support_column)
    # NOTE(review): despite its name this counts whitespace-separated tokens,
    # not '#' hashtags — kept as-is to preserve the existing feature values.
    df['hashtag_count'] = df['text'].apply(lambda x: len(x.split(" ")) if isinstance(x, str) else 0)
    df['capitals'] = df['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['exclamation_marks'] = df['text'].apply(lambda x: x.count('!'))
    df['sentiment'] = df['text'].apply(lambda x: analyser.polarity_scores(x)['compound'])
    df['readability'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))

    # Preprocess the text column in place.
    # Bug fix: the original looped `for i in range(len(df))` and indexed
    # `df.text[i]` by LABEL — after the Y/N filter the labels are no longer
    # 0..n-1, so most rows were skipped or raised KeyError, silently
    # swallowed by a bare `except:`. Iterate the surviving index labels and
    # write via .loc to avoid chained-assignment writes to a copy.
    for count, label in enumerate(df.index, start=1):
        if count % 100 == 0:
            print(count)
        try:
            df.loc[label, 'text'] = preprocess(df.loc[label, 'text'])
        except Exception as exc:  # best-effort, but report what went wrong
            print("Error processing at ", count, exc)

    df.to_csv('data/preprocessed/april-21.csv', index=None)