indeedScrape.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the information from [this link](https://github.com/aakashtandel/Web-Scraping-Indeed/blob/master/Code/Project%203%20-%20Web%20Scraping%20Indeed%20Job%20Listings%20Jupyter%20Notebook.ipynb), we extract data from Indeed\n",
    "\n",
    "#### The point of this notebook is to practice web scraping using Beautiful Soup, in particular to take data from Indeed for use in analysis. We would eventually like to collect the entire job description for each posting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "URL = \"https://www.indeed.com/jobs?q=data+scientist&l=New+York%2C+NY&start=10\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib\n",
    "import requests\n",
    "import bs4\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _clean_text(tag):\n",
    "    \"\"\"Return the tag's text without newlines or edge whitespace, or 'None' if the tag is missing.\"\"\"\n",
    "    if tag is None:\n",
    "        return 'None'\n",
    "    return tag.text.replace('\\n', '').strip()\n",
    "\n",
    "\n",
    "def parse(url):\n",
    "    \"\"\"Scrape one search-results page into a DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the search-results page to download.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per element with class `result`, with the string columns\n",
    "        Title, Location, Company, Salary and Synopsis. A field whose element\n",
    "        is missing from a listing holds the string 'None'.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.RequestException\n",
    "        If the request fails, times out, or returns an HTTP error status.\n",
    "    \"\"\"\n",
    "    response = requests.get(url, timeout=30)\n",
    "    # Fail loudly on 4xx/5xx instead of silently parsing an error page.\n",
    "    response.raise_for_status()\n",
    "    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=\"utf-8\")\n",
    "    columns = [\"Title\", \"Location\", \"Company\", \"Salary\", \"Synopsis\"]\n",
    "    # Collect plain dicts and build the frame once: DataFrame.append copied the\n",
    "    # whole frame for every row and was removed in pandas 2.0.\n",
    "    rows = []\n",
    "    for each in soup.find_all(class_=\"result\"):\n",
    "        rows.append({\n",
    "            'Title': _clean_text(each.find(class_='jobtitle')),\n",
    "            'Location': _clean_text(each.find('span', {'class': 'location'})),\n",
    "            'Company': _clean_text(each.find(class_='company')),\n",
    "            # Salary used to keep its leading newline and indentation.\n",
    "            'Salary': _clean_text(each.find('span', {'class': 'no-wrap'})),\n",
    "            # Previously unguarded: a listing without a summary raised AttributeError.\n",
    "            'Synopsis': _clean_text(each.find('span', {'class': 'summary'})),\n",
    "        })\n",
    "    return pd.DataFrame(rows, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>Location</th>\n",
       "      <th>Company</th>\n",
       "      <th>Salary</th>\n",
       "      <th>Synopsis</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Principal Data Scientist</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Foot Locker</td>\n",
       "      <td>\\n                $200,000 - $245,000 a year</td>\n",
       "      <td>The Principal Data Scientist leads...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Applied Scientist</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Amazon.com</td>\n",
       "      <td>None</td>\n",
       "      <td>Applied Scientist who has deep kno...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Data Analyst Intern to Full Time Hire</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>BuyerGenomics</td>\n",
       "      <td>None</td>\n",
       "      <td>Customer analytics, digital analyt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Sr. Data Scientist</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Amazon.com</td>\n",
       "      <td>None</td>\n",
       "      <td>3+ years of relevant experience in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>Parsippany, NJ 07054</td>\n",
       "      <td>Mondelez International</td>\n",
       "      <td>None</td>\n",
       "      <td>Manages full data portfolio of spe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Dataminr</td>\n",
       "      <td>None</td>\n",
       "      <td>You're a dedicated Data Scientist ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Data Scientist (machine learning)</td>\n",
       "      <td>New York, NY 10036</td>\n",
       "      <td>The New York Times</td>\n",
       "      <td>None</td>\n",
       "      <td>The Times seeks a Data Scientist t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Data Scientist I</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>AIG</td>\n",
       "      <td>None</td>\n",
       "      <td>Data Scientists collaborate with AIG’s Life an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Data Scientist - Global Investment Banking - A...</td>\n",
       "      <td>New York, NY 10179 (Midtown area)</td>\n",
       "      <td>JP Morgan Chase</td>\n",
       "      <td>None</td>\n",
       "      <td>We have a great opportunity within...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>LexisNexis</td>\n",
       "      <td>None</td>\n",
       "      <td>This will require application of m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>AI Research Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Bloomberg</td>\n",
       "      <td>None</td>\n",
       "      <td>The infrastructure...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Data Analytics Specialist</td>\n",
       "      <td>None</td>\n",
       "      <td>SquarePeg Hires</td>\n",
       "      <td>\\n                $40,000 - $70,000 a year</td>\n",
       "      <td>For our Manhattan ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Data Scientist, Retention</td>\n",
       "      <td>None</td>\n",
       "      <td>DISNEY</td>\n",
       "      <td>None</td>\n",
       "      <td>Graduate Degree De...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Associate Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Church Pension Group</td>\n",
       "      <td>None</td>\n",
       "      <td>The Associate Data...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Liberty Lending</td>\n",
       "      <td>None</td>\n",
       "      <td>Conduct data minin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Data Scientist I</td>\n",
       "      <td>None</td>\n",
       "      <td>AIG</td>\n",
       "      <td>None</td>\n",
       "      <td>Data Scientists collaborate with AIG’s Life an...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Title  \\\n",
       "0                            Principal Data Scientist   \n",
       "1                                   Applied Scientist   \n",
       "2               Data Analyst Intern to Full Time Hire   \n",
       "3                                  Sr. Data Scientist   \n",
       "4                                      Data Scientist   \n",
       "5                                      Data Scientist   \n",
       "6                   Data Scientist (machine learning)   \n",
       "7                                    Data Scientist I   \n",
       "8   Data Scientist - Global Investment Banking - A...   \n",
       "9                                      Data Scientist   \n",
       "10                              AI Research Scientist   \n",
       "11                          Data Analytics Specialist   \n",
       "12                          Data Scientist, Retention   \n",
       "13                           Associate Data Scientist   \n",
       "14                                     Data Scientist   \n",
       "15                                   Data Scientist I   \n",
       "\n",
       "                             Location                         Company  \\\n",
       "0                        New York, NY                     Foot Locker   \n",
       "1                        New York, NY                      Amazon.com   \n",
       "2                        New York, NY                   BuyerGenomics   \n",
       "3                        New York, NY                      Amazon.com   \n",
       "4                Parsippany, NJ 07054          Mondelez International   \n",
       "5                        New York, NY                        Dataminr   \n",
       "6                  New York, NY 10036              The New York Times   \n",
       "7                        New York, NY                             AIG   \n",
       "8   New York, NY 10179 (Midtown area)                 JP Morgan Chase   \n",
       "9                        New York, NY                      LexisNexis   \n",
       "10                               None                       Bloomberg   \n",
       "11                               None                 SquarePeg Hires   \n",
       "12                               None                          DISNEY   \n",
       "13                               None            Church Pension Group   \n",
       "14                               None                 Liberty Lending   \n",
       "15                               None                             AIG   \n",
       "\n",
       "                                          Salary  \\\n",
       "0   \\n                $200,000 - $245,000 a year   \n",
       "1                                           None   \n",
       "2                                           None   \n",
       "3                                           None   \n",
       "4                                           None   \n",
       "5                                           None   \n",
       "6                                           None   \n",
       "7                                           None   \n",
       "8                                           None   \n",
       "9                                           None   \n",
       "10                                          None   \n",
       "11    \\n                $40,000 - $70,000 a year   \n",
       "12                                          None   \n",
       "13                                          None   \n",
       "14                                          None   \n",
       "15                                          None   \n",
       "\n",
       "                                             Synopsis  \n",
       "0               The Principal Data Scientist leads...  \n",
       "1               Applied Scientist who has deep kno...  \n",
       "2               Customer analytics, digital analyt...  \n",
       "3               3+ years of relevant experience in...  \n",
       "4               Manages full data portfolio of spe...  \n",
       "5               You're a dedicated Data Scientist ...  \n",
       "6               The Times seeks a Data Scientist t...  \n",
       "7   Data Scientists collaborate with AIG’s Life an...  \n",
       "8               We have a great opportunity within...  \n",
       "9               This will require application of m...  \n",
       "10                              The infrastructure...  \n",
       "11                              For our Manhattan ...  \n",
       "12                              Graduate Degree De...  \n",
       "13                              The Associate Data...  \n",
       "14                              Conduct data minin...  \n",
       "15  Data Scientists collaborate with AIG’s Life an...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parse(URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# str.replace returns a new string, so the original loop never advanced URL and\n",
    "# fetched the same page nine times. Build each page's address explicitly instead.\n",
    "base_url = URL.rsplit('&start=', 1)[0]  # drop the hard-coded offset, if present\n",
    "frames = [parse(f'{base_url}&start={start}') for start in range(10, 100, 10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.indeed.com/jobs?q=data+scientist&l=New+York%2C+NY&start=20'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "URL.replace(\"10\", \"20\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ignore_index gives the combined frame a unique 0..n-1 index; without it every\n",
    "# page's rows restart at 0 and a label lookup can match several rows at once.\n",
    "indeed = pd.concat(frames, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n",
      "20\n",
      "30\n",
      "40\n",
      "50\n",
      "60\n",
      "70\n",
      "80\n",
      "90\n",
      "100\n"
     ]
    }
   ],
   "source": [
    "for i in range(10, 110, 10):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>Location</th>\n",
       "      <th>Company</th>\n",
       "      <th>Salary</th>\n",
       "      <th>Synopsis</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Associate Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Church Pension Group</td>\n",
       "      <td>None</td>\n",
       "      <td>The Associate Data...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Liberty Lending</td>\n",
       "      <td>None</td>\n",
       "      <td>Conduct data minin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Data Analytics Specialist</td>\n",
       "      <td>None</td>\n",
       "      <td>SquarePeg Hires</td>\n",
       "      <td>\\n                $40,000 - $70,000 a year</td>\n",
       "      <td>For our Manhattan ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AI Research Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Bloomberg</td>\n",
       "      <td>None</td>\n",
       "      <td>The infrastructure...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NBCUniversal Summer 2019 Data Science Internsh...</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>NBCUniversal</td>\n",
       "      <td>None</td>\n",
       "      <td>Unlike a traditional internship, t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Junior Data Scientist</td>\n",
       "      <td>New York, NY 10176 (Murray Hill area)</td>\n",
       "      <td>Dow Jones</td>\n",
       "      <td>None</td>\n",
       "      <td>You have worked with visualization...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Data Analyst Intern to Full Time Hire</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>BuyerGenomics</td>\n",
       "      <td>None</td>\n",
       "      <td>Customer analytics, digital analyt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Junior Data Scientist</td>\n",
       "      <td>New York, NY 10036</td>\n",
       "      <td>Viacom</td>\n",
       "      <td>None</td>\n",
       "      <td>Knowledge of algorithms for data m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>Jersey City, NJ 07302 (Downtown area)</td>\n",
       "      <td>Bank of America</td>\n",
       "      <td>None</td>\n",
       "      <td>Data Science, Machine Learning, Data Analytics...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Data Analyst / Scientist</td>\n",
       "      <td>New York, NY 10038 (Financial District area)</td>\n",
       "      <td>Bank of America</td>\n",
       "      <td>None</td>\n",
       "      <td>Advanced knowledge and experience ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Data Science Intern (2019)</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Chegg</td>\n",
       "      <td>None</td>\n",
       "      <td>Some knowledge of statistics, econ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Data Science Undergraduate Internship</td>\n",
       "      <td>New York, NY 10016 (Gramercy area)</td>\n",
       "      <td>Aetna</td>\n",
       "      <td>None</td>\n",
       "      <td>Use modeling, machine learning and...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Data Scientist (Product)</td>\n",
       "      <td>New York, NY 10011 (Chelsea area)</td>\n",
       "      <td>Spotify</td>\n",
       "      <td>None</td>\n",
       "      <td>As a Data Scientist, our mission i...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2019 Summer Internship - Statcast Data Analytics</td>\n",
       "      <td>New York, NY</td>\n",
       "      <td>Major League Baseball</td>\n",
       "      <td>None</td>\n",
       "      <td>Proficiency in SQL, R, or Python. ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Data Scientist I</td>\n",
       "      <td>None</td>\n",
       "      <td>AIG</td>\n",
       "      <td>None</td>\n",
       "      <td>Data Scientists collaborate with AIG’s Life an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Data Scientist, Retention</td>\n",
       "      <td>None</td>\n",
       "      <td>DISNEY</td>\n",
       "      <td>None</td>\n",
       "      <td>Graduate Degree De...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Associate Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Church Pension Group</td>\n",
       "      <td>None</td>\n",
       "      <td>The Associate Data...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Data Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Liberty Lending</td>\n",
       "      <td>None</td>\n",
       "      <td>Conduct data minin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>AI Research Scientist</td>\n",
       "      <td>None</td>\n",
       "      <td>Bloomberg</td>\n",
       "      <td>None</td>\n",
       "      <td>The infrastructure...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Data Scientist, Analytics, University Grad</td>\n",
       "      <td>New York, NY 10017 (Midtown area)</td>\n",
       "      <td>Facebook</td>\n",
       "      <td>None</td>\n",
       "      <td>The Analytics team is looking for ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Title  \\\n",
       "0                            Associate Data Scientist   \n",
       "1                                      Data Scientist   \n",
       "2                           Data Analytics Specialist   \n",
       "3                               AI Research Scientist   \n",
       "4   NBCUniversal Summer 2019 Data Science Internsh...   \n",
       "5                               Junior Data Scientist   \n",
       "6               Data Analyst Intern to Full Time Hire   \n",
       "7                               Junior Data Scientist   \n",
       "8                                      Data Scientist   \n",
       "9                            Data Analyst / Scientist   \n",
       "10                         Data Science Intern (2019)   \n",
       "11              Data Science Undergraduate Internship   \n",
       "12                           Data Scientist (Product)   \n",
       "13   2019 Summer Internship - Statcast Data Analytics   \n",
       "14                                   Data Scientist I   \n",
       "15                          Data Scientist, Retention   \n",
       "16                           Associate Data Scientist   \n",
       "17                                     Data Scientist   \n",
       "18                              AI Research Scientist   \n",
       "19         Data Scientist, Analytics, University Grad   \n",
       "\n",
       "                                        Location  \\\n",
       "0                                           None   \n",
       "1                                           None   \n",
       "2                                           None   \n",
       "3                                           None   \n",
       "4                                   New York, NY   \n",
       "5          New York, NY 10176 (Murray Hill area)   \n",
       "6                                   New York, NY   \n",
       "7                             New York, NY 10036   \n",
       "8          Jersey City, NJ 07302 (Downtown area)   \n",
       "9   New York, NY 10038 (Financial District area)   \n",
       "10                                  New York, NY   \n",
       "11            New York, NY 10016 (Gramercy area)   \n",
       "12             New York, NY 10011 (Chelsea area)   \n",
       "13                                  New York, NY   \n",
       "14                                          None   \n",
       "15                                          None   \n",
       "16                                          None   \n",
       "17                                          None   \n",
       "18                                          None   \n",
       "19             New York, NY 10017 (Midtown area)   \n",
       "\n",
       "                          Company                                      Salary  \\\n",
       "0            Church Pension Group                                        None   \n",
       "1                 Liberty Lending                                        None   \n",
       "2                 SquarePeg Hires  \\n                $40,000 - $70,000 a year   \n",
       "3                       Bloomberg                                        None   \n",
       "4                    NBCUniversal                                        None   \n",
       "5                       Dow Jones                                        None   \n",
       "6                   BuyerGenomics                                        None   \n",
       "7                          Viacom                                        None   \n",
       "8                 Bank of America                                        None   \n",
       "9                 Bank of America                                        None   \n",
       "10                          Chegg                                        None   \n",
       "11                          Aetna                                        None   \n",
       "12                        Spotify                                        None   \n",
       "13          Major League Baseball                                        None   \n",
       "14                            AIG                                        None   \n",
       "15                         DISNEY                                        None   \n",
       "16           Church Pension Group                                        None   \n",
       "17                Liberty Lending                                        None   \n",
       "18                      Bloomberg                                        None   \n",
       "19                       Facebook                                        None   \n",
       "\n",
       "                                             Synopsis  \n",
       "0                               The Associate Data...  \n",
       "1                               Conduct data minin...  \n",
       "2                               For our Manhattan ...  \n",
       "3                               The infrastructure...  \n",
       "4               Unlike a traditional internship, t...  \n",
       "5               You have worked with visualization...  \n",
       "6               Customer analytics, digital analyt...  \n",
       "7               Knowledge of algorithms for data m...  \n",
       "8   Data Science, Machine Learning, Data Analytics...  \n",
       "9               Advanced knowledge and experience ...  \n",
       "10              Some knowledge of statistics, econ...  \n",
       "11              Use modeling, machine learning and...  \n",
       "12              As a Data Scientist, our mission i...  \n",
       "13              Proficiency in SQL, R, or Python. ...  \n",
       "14  Data Scientists collaborate with AIG’s Life an...  \n",
       "15                              Graduate Degree De...  \n",
       "16                              The Associate Data...  \n",
       "17                              Conduct data minin...  \n",
       "18                              The infrastructure...  \n",
       "19              The Analytics team is looking for ...  "
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indeed.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            Conduct data mining from internal and external data sources. As a Data Scientist, you will use statistical analysis and data mining techniques to optimize risk...\n"
     ]
    }
   ],
   "source": [
    "# .iloc selects by position, so exactly one synopsis is printed even if the\n",
    "# frame's index contains repeated labels.\n",
    "print(indeed[\"Synopsis\"].iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Note that the above description is incomplete, and we are primarily interested in full job descriptions.\n",
    "\n",
    "### We now follow the guidance of [this link](https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac) to help us \"click\" through the website and pull full job descriptions. Below we import Selenium, which is what allows us to click around Indeed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def get_soup(url):\n",
    "    \"\"\"Load `url` in a Safari WebDriver session and return the page source as soup.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the page to open in the browser.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    bs4.BeautifulSoup\n",
    "        The driver's `page_source` parsed with 'html.parser'.\n",
    "    \"\"\"\n",
    "    driver = webdriver.Safari()\n",
    "    try:\n",
    "        driver.get(url)\n",
    "        html = driver.page_source\n",
    "    finally:\n",
    "        # Always end the session, even when loading the page fails; quit() shuts\n",
    "        # the driver down, whereas close() only closes the current window.\n",
    "        driver.quit()\n",
    "    return BeautifulSoup(html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_posting(url):\n",
    "    \"\"\"Fetch a page with get_soup and return its title and posting text, lower-cased.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the page to load.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (str, str)\n",
    "        Text of the first <h3> element and text of the first <div> with class\n",
    "        'jobsearch-JobComponent', both lower-cased.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If either element is absent from the page.\n",
    "    \"\"\"\n",
    "    soup = get_soup(url)\n",
    "    title_tag = soup.find(name='h3')\n",
    "    posting_tag = soup.find(name='div', attrs={'class': \"jobsearch-JobComponent\"})\n",
    "    # find() returns None when nothing matches; report that clearly instead of\n",
    "    # failing with \"'NoneType' object has no attribute 'getText'\".\n",
    "    if title_tag is None or posting_tag is None:\n",
    "        raise ValueError(\n",
    "            f'{url} does not look like a job posting page: '\n",
    "            'missing <h3> title or jobsearch-JobComponent container'\n",
    "        )\n",
    "    return title_tag.getText().lower(), posting_tag.get_text().lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'soup' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-17-b18ccbe2a45e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0murls\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'div'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"searchCount\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'soup' is not defined"
     ]
    }
   ],
   "source": [
    "def grab_job_links(soup):\n",
    "    urls = []\n",
    "    for link in soup.find_all('h2', {'class': 'jobtitle'}):\n",
    "        partial_url = link.a.get('href')\n",
    "        url = 'https://ca.indeed.com' + partial_url\n",
    "        urls.append(url)\n",
    "    return urls\n",
    "\n",
    "# `soup` was never defined at notebook level (this line raised NameError), so\n",
    "# fetch the search-results page first. find() returns None when no element matches.\n",
    "search_soup = get_soup(URL)\n",
    "search_count = search_soup.find(name='div', attrs={'id':\"searchCount\"})\n",
    "search_count.get_text() if search_count is not None else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'NoneType' object has no attribute 'getText'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-19-32107e3b98c4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_posting\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-14-065e12631732>\u001b[0m in \u001b[0;36mget_posting\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_posting\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_soup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mtitle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'h3'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetText\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0mposting\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'div'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'class'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"jobsearch-JobComponent\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mtitle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mposting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'getText'"
     ]
    }
   ],
   "source": [
    "# URL is a search-results page; calling get_posting on it failed because the\n",
    "# page has no <h3>. Follow the first job link found on the results page instead.\n",
    "# NOTE(review): assumes links from grab_job_links resolve to posting pages -- confirm.\n",
    "job_links = grab_job_links(get_soup(URL))\n",
    "get_posting(job_links[0]) if job_links else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}