Untitled.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import read_csv, DataFrame\n",
    "from sklearn.model_selection import train_test_split\n",
    "import language_tool_python\n",
    "\n",
    "#import textstat\n",
    "#from imblearn.over_sampling import RandomOverSampler\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.svm import SVC\n",
    "import pandas as pd\n",
    "from spellchecker import SpellChecker\n",
    "\n",
    "from yellowbrick.cluster import KElbowVisualizer\n",
    "\n",
    "  \n",
    "\"\"\"\n",
    "# Mention the language keyword \n",
    "tool = language_check.LanguageTool('en-US') \n",
    "def get_data(df,sampling_method=\"None\"):\n",
    "    dataset = df.values\n",
    "\n",
    "    X = dataset[:, 1:]\n",
    "    Y = dataset[:,0]\n",
    "    seed = 7\n",
    "    test_size = 0.33\n",
    "    train_size = 0.67\n",
    "\n",
    "    oversample = RandomOverSampler(sampling_strategy='all')\n",
    "    X, Y = oversample.fit_resample(X,Y)\n",
    "\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed, train_size=train_size)\n",
    "    calc(X_train,X_test,Y_train, Y_test)\n",
    "def calc( X_train,X_test,Y_train,Y_test):\n",
    "    best_model = XGBClassifier(\n",
    "                                min_child_weight=1,\n",
    "                                gamma=0.1,\n",
    "                                subsample=1.0,\n",
    "                                colsample_bytree=1.0,\n",
    "                                max_depth=20,\n",
    "                                eta=1)\n",
    "    \n",
    "    best_model.fit(X_train,Y_train)\n",
    "\n",
    "    y_pred = best_model.predict(X_test)\n",
    "    predictions = [round(value) for value in y_pred]\n",
    "    #accuracy = round(accuracy_score(Y_test, predictions) * 100.0,3)\n",
    "    accuracy = accuracy_score(Y_test, y_pred)\n",
    "    print(accuracy)\n",
    "\"\"\"\n",
    "tool = language_tool_python.LanguageTool('en-US')\n",
    "file_path = 'data/extracted/normalized/2-gram-4-clusters.csv'\n",
    "spell = SpellChecker()\n",
    "preprocessed = read_csv('data/preprocessed/april-21.csv')\n",
    "df = read_csv('data/raw/april-21.csv',encoding='ISO-8859-1')\n",
    "small_df = df\n",
    "def process(text):\n",
    "    #spell.unknown(text.split(\" \"))\n",
    "    try:\n",
    "        \n",
    "        return len(tool.check(text))\n",
    "    except:\n",
    "        print(\"ERROR\")\n",
    "        print(text)\n",
    "        return 0\n",
    "\n",
    "\n",
    "small_df['spelling_errors'] =  small_df['text'].apply(process)\n",
    "yes_df = small_df[small_df['Y/N'] == 'Y']\n",
    "no_df = small_df[small_df['Y/N'] == 'N']\n",
    "print(yes_df['spelling_errors'].mean())\n",
    "print(no_df['spelling_errors'].mean())\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "#df['readability'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'df[\\'word_count\\'] = df[\\'text\\'].apply(lambda x : len(x.split()))\\ndf[\\'char_count\\'] = df[\\'text\\'].apply(lambda x : len(x.replace(\" \",\"\")))\\ndf[\\'total_length\\'] = df[\\'text\\'].apply(len)\\n\\n\\nfor each in [\\'word_count\\',\\'char_count\\',\\'total_length\\']:\\n    print(yes_df[each].mean())\\n    print(no_df[each].mean())\\n    \\ndf[\\'hashtag_count\\'] = df[\\'hashtags\\'].apply(lambda x: len(x.split(\" \")) if isinstance(x,str) else 0)\\ndf[\\'num_unique_words\\'] = df[\\'text\\'].apply(lambda x: len(set(w for w in x.split())))\\ndf[\\'capitals\\'] = df[\\'text\\'].apply(lambda comment: sum(1 for c in comment if c.isupper()))\\ndf[\\'num_exclamation_marks\\'] =df[\\'text\\'].apply(lambda x: x.count(\\'!\\'))\\ndf[\\'num_question_marks\\'] = df[\\'text\\'].apply(lambda x: x.count(\\'?\\'))\\ndf[\\'num_punctuation\\'] = df[\\'text\\'].apply(lambda x: sum(x.count(w) for w in \\'.,;:\\'))\\ndf[\\'num_symbols\\'] = df[\\'text\\'].apply(lambda x: sum(x.count(w) for w in \\'*&$%\\'))\\n\\n\\n\\nfor each in [\\'hashtag_count\\',\\'num_unique_words\\',\\'capitals\\',\\'num_exclamation_marks\\',\\'num_question_marks\\',\\'num_punctuation\\',\\'num_symbols\\']:\\n    print(each)\\n    print(\"SUPPORT:\", yes_df[each].mean())\\n    print(\"NOT SUPPORITING: \",no_df[each].mean())\\n    '"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n",
    "\"\"\"df['word_count'] = df['text'].apply(lambda x : len(x.split()))\n",
    "df['char_count'] = df['text'].apply(lambda x : len(x.replace(\" \",\"\")))\n",
    "df['total_length'] = df['text'].apply(len)\n",
    "\n",
    "\n",
    "for each in ['word_count','char_count','total_length']:\n",
    "    print(yes_df[each].mean())\n",
    "    print(no_df[each].mean())\n",
    "    \n",
    "df['hashtag_count'] = df['hashtags'].apply(lambda x: len(x.split(\" \")) if isinstance(x,str) else 0)\n",
    "df['num_unique_words'] = df['text'].apply(lambda x: len(set(w for w in x.split())))\n",
    "df['capitals'] = df['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))\n",
    "df['num_exclamation_marks'] =df['text'].apply(lambda x: x.count('!'))\n",
    "df['num_question_marks'] = df['text'].apply(lambda x: x.count('?'))\n",
    "df['num_punctuation'] = df['text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))\n",
    "df['num_symbols'] = df['text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))\n",
    "\n",
    "\n",
    "\n",
    "for each in ['hashtag_count','num_unique_words','capitals','num_exclamation_marks','num_question_marks','num_punctuation','num_symbols']:\n",
    "    print(each)\n",
    "    print(\"SUPPORT:\", yes_df[each].mean())\n",
    "    print(\"NOT SUPPORITING: \",no_df[each].mean())\n",
    "    \"\"\"\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data/preprocessed/without_stop_words.csv', encoding='ISO-8859-1')\n",
    "\n",
    "def count_if_exist(corpus, word):\n",
    "    word_count = corpus.get(word,0) + 1\n",
    "    corpus[word] = word_count\n",
    "\n",
    "def convert_dict_to_csv(input_dict,name):\n",
    "    pd.DataFrame(input_dict.items()).to_csv(\"./data/computed/{name}.csv\".format(name=name),header=None,index=None)\n",
    "general_word_count = {}\n",
    "supporting_word_count = {} \n",
    "non_supporting_word_count = {}\n",
    "word_support_probability = {}\n",
    "for i in range(1,len(df)):\n",
    "    cur_row = df.iloc[i]\n",
    "    words = cur_row.text.split(' ')\n",
    "    for word in words:\n",
    "        count_if_exist(general_word_count,word)\n",
    "        if(cur_row['Y/N'] == 'Y'):\n",
    "            count_if_exist(supporting_word_count,word)\n",
    "        else:\n",
    "            count_if_exist(non_supporting_word_count,word)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['today', 'say', 'stay', 'take', 'like', 'amp', 'protester', 'gridlock', 'think', 'many', 'michigan', 'make', 'live', 'operationgridlock', 'need', 'people', 'get', 'protest', 'right', 'governor', 'state', 'would', 'order', 'one', 'home', 'trump', 'want', 'see', 'work', 'go', 'u', 'mi', 'lansing']\n"
     ]
    }
   ],
   "source": [
    "supporting_pair = sorted(supporting_word_count.items(),key=lambda x: x[1],reverse=True)\n",
    "non_supporting_pair =  sorted(non_supporting_word_count.items(),key=lambda x: x[1],reverse=True)\n",
    "supporting_words = list(map(lambda x: x[0], supporting_pair))[:50]\n",
    "non_supporting_words = list(map(lambda x: x[0], non_supporting_pair))[:50]\n",
    "intersection = list(set(supporting_words) & set(non_supporting_words))\n",
    "print(intersection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_dict_to_csv(input_dict,name):\n",
    "    pd.DataFrame(input_dict.items()).to_csv(\"./data/computed/{name}.csv\".format(name=name),header=['count'],index=['readability'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}