dplyr_tidyr.Rmd

---
title: "R Notebook"
output: html_notebook
---


```{r}
library(dplyr)
library(gapminder)
library(tidyr)
# Load the gapminder data set into the workspace.
data("gapminder")
# attach() puts the data frame on the search path so that its columns can be
# referenced by bare name.
# NOTE(review): attach() is generally discouraged; prefer explicit
# `gapminder$column` references or dplyr verbs.
attach(gapminder)

# Preview the first rows.
head(gapminder)
```
Gapminder data loaded. Next: manipulation.
```{r}
# Work with a shorter name for the data and drop the original binding.
gap <- gapminder
rm(gapminder)

# Mean GDP per capita for three continents. The row mask is built from
# gap$continent explicitly, so it is guaranteed to have the same length and
# row order as gap$gdpPercap instead of depending on a bare `continent`
# resolved through the search path.
mean.gdp.africa <- mean(gap$gdpPercap[gap$continent == "Africa"])
mean.gdp.africa
mean.gdp.asia <- mean(gap$gdpPercap[gap$continent == "Asia"])
mean.gdp.asia
mean.gdp.americas <- mean(gap$gdpPercap[gap$continent == "Americas"])
mean.gdp.americas

#mean(gap[gap$continent == "Africa", "gdpPercap"])

```

### Working with data using dplyr
- select() - subset variables (columns)
- filter() - subset observations (rows)
- group_by() - group rows by one or more variables
- summarize() - compute summary values for each group
- mutate() - add a new column based on existing data

dplyr introduces a new operator called a pipe: %>%

select:
```{r}
# Inspect the table's width and its column names.
ncol(gap)
names(gap)

# Keep only the three columns of interest and show the result.
subset_gap <- gap %>%
  select(year, country, gdpPercap)
subset_gap
```

filter:
```{r}
# Pipeline: start from gap, keep the European rows, then keep three columns.
# The result is stored in yr_country_gdp_euro.
yr_country_gdp_euro <- gap %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)
yr_country_gdp_euro

# African rows for 2007 only, reduced to year, country and life expectancy.
lifeExp_africa_07 <- gap %>%
  filter(continent == "Africa" & year == 2007) %>%
  select(year, country, lifeExp)
dim(lifeExp_africa_07)
```

group_by() - subset data frame
Turns a single data frame into a set of data frames based on some variable
```{r}
# Look at the ungrouped table first, for comparison.
head(gap)
str(gap)

# Same rows, now grouped by continent.
gap_by_continent <- group_by(gap, continent)
head(gap_by_continent)
str(gap_by_continent)

# One row per continent holding its mean GDP per capita.
gdp_by_continent <- gap_by_continent %>%
  summarize(
    mean_gdpPercap = mean(gdpPercap)
  )
```

Exercise
```{r}
# Average life expectancy across all African countries by year.
# In how many years did the average life expectancy decrease?

african_le_yrs <- gap %>%
  filter(continent == "Africa") %>%
  group_by(year) %>%
  summarize(avg_life = mean(lifeExp))

# View() opens an interactive data viewer, so only call it when a user is
# present; this keeps non-interactive rendering from failing.
if (interactive()) {
  View(african_le_yrs)
}

# diff() gives the change between consecutive rows of the yearly averages;
# a negative change is a decrease. This also handles a one-row table
# (diff() is empty, so the count is 0), which a `1:(n - 1)` loop does not.
life_exp_dec <- sum(diff(african_le_yrs$avg_life) < 0)

print(life_exp_dec)
```

Calculate average life expectancy per country. Which has the longest and shortest average life expectancy?

```{r}
# Mean life expectancy of every country over all of its rows.
le_by_country <- summarize(
  group_by(gap, country),
  avg_le = mean(lifeExp)
)

# Country (or countries, on ties) with the shortest average, then the longest.
filter(le_by_country, avg_le == min(avg_le))
filter(le_by_country, avg_le == max(avg_le))
```

mutate() is a function in dplyr that lets you add a column to a dataframe based on the data.
```{r}
# Total GDP in billions for 2007: per-capita GDP times population, divided
# by 1e9, keeping only the identifying columns and the new value.
bill_gdp <- gap %>%
  filter(year == 2007) %>%
  mutate(billion_gdp = (gdpPercap * pop) / 1e9) %>%
  select(country, continent, billion_gdp)

names(bill_gdp)
dim(bill_gdp)
```

Many of these functions can handle several arguments at once

```{r}
# Several summaries per continent/year group in a single summarize() call.
# n() is the number of rows in the current group; se_gdp reuses sd_gdp and
# sample_size, which are defined earlier in the same call.
gdp_by_continent <- gap %>%
  group_by(continent, year) %>%
  summarize(
    mean_gdp = mean(gdpPercap),
    sd_gdp = sd(gdpPercap),
    mean_pop = mean(pop),
    sample_size = n(),
    se_gdp = sd_gdp / sqrt(sample_size)
  )

#View(gdp_by_continent)

str(gdp_by_continent)
```

Use `data.frame()` to convert the grouped result created by `group_by()` back into a single plain data frame. `ungroup()` serves the same purpose by removing the grouping.
```{r}
# Convert the grouped summary to a plain data frame, then pass it through
# ungroup() as well, and inspect the resulting structure.
gdp_by_continent <- gdp_by_continent %>%
  data.frame() %>%
  ungroup()
str(gdp_by_continent)
```

### Combining dplyr with ggplot2
```{r}
library(ggplot2)

# Scatter plot of life expectancy against GDP per capita, Asian rows only.
ggplot(
  data = filter(gap, continent == "Asia"),
  mapping = aes(x = gdpPercap, y = lifeExp)
) +
  geom_point()
```

Wide and Long
```{r}
# Read the wide-format table, keeping text columns as character vectors.
gap_wide <- read.csv(
  file = "data/gapminder_wide.csv",
  stringsAsFactors = FALSE
)
dim(gap_wide)
names(gap_wide)
```

This data is in a wide format. We want it to be in a long format.

```{r}
head(gap_wide)

# gather() stacks every column except continent and country into a
# key column (obstype_year) and a value column (obs_values).
gap_long <- gap_wide %>%
  gather(key = obstype_year, value = obs_values, -continent, -country)

dim(gap_long)

# The combined key makes filtering by year awkward, so split it at the
# underscore into the observation type and the year.
gap_long <- gap_long %>%
  separate(col = obstype_year, into = c("obs_type", "year"), sep = "_")
head(gap_long)

# separate() leaves year as text; store it as an integer instead.
gap_long <- mutate(gap_long, year = as.integer(year))
head(gap_long)
```

```{r}
# Mean of the lifeExp observations per continent, from the long table.
gap_long %>%
  filter(obs_type == "lifeExp") %>%
  group_by(continent) %>%
  summarize(avg_life_exp = mean(obs_values))
```

Opposite of gather is spread

```{r}
str(gap_long)

# spread() turns each distinct obs_type into its own column filled from
# obs_values.
gap_normal <- spread(gap_long, key = obs_type, value = obs_values)
str(gap_normal)
names(gap_normal)
names(gap)

# Put the columns in the same order as in gap.
gap_normal <- gap_normal[names(gap)]
names(gap_normal)

# Compare the two tables. The original author notes that they do not match
# because the column data types differ.
all.equal(gap, gap_normal)
```
```{r}
# NOTE(review): the original pipeline ended in a dangling `s`, which fails at
# run time. It is reconstructed here as a type-alignment step so gap_normal
# can be compared with gap — confirm this was the intent.
# Assumes gap$country and gap$continent are factors whose levels cover the
# values read from the CSV — TODO confirm.
gap_normal <- gap_normal %>%
  mutate(
    country = factor(country, levels = levels(gap$country)),
    continent = factor(continent, levels = levels(gap$continent))
  ) %>%
  arrange(country, year)
str(gap_normal)
str(gap)
```
	---
	title: "R Notebook"
	output: html_notebook
	---


	```{r}
	library(dplyr)
	library(gapminder)
	# Load the gapminder data set into the workspace.
	data("gapminder")
	# attach() puts the data frame on the search path so that its columns can be
	# referenced by bare name.
	# NOTE(review): attach() is generally discouraged; prefer explicit
	# `gapminder$column` references or dplyr verbs.
	attach(gapminder)

	# Preview the first rows.
	head(gapminder)
	```
	Gapminder data loaded. Next: manipulation.
	```{r}
	# Work with a shorter name for the data and drop the original binding.
	gap <- gapminder
	rm(gapminder)

	# Mean GDP per capita for three continents. The row mask is built from
	# gap$continent explicitly, so it is guaranteed to have the same length and
	# row order as gap$gdpPercap instead of depending on a bare `continent`
	# resolved through the search path.
	mean.gdp.africa <- mean(gap$gdpPercap[gap$continent == "Africa"])
	mean.gdp.africa
	mean.gdp.asia <- mean(gap$gdpPercap[gap$continent == "Asia"])
	mean.gdp.asia
	mean.gdp.americas <- mean(gap$gdpPercap[gap$continent == "Americas"])
	mean.gdp.americas

	#mean(gap[gap$continent == "Africa", "gdpPercap"])

	```

	### Working with data using dplyr
	select() - subset variables (columns)
	filter() - subset observations (rows)
	group_by() - counts and groups data
	summarize() - useful information about data
	mutate() - add a new column based on the data

	dplyr introduces a new operator called a pipe: %>%

	select:
	```{r}
	# Inspect the table's width and its column names.
	ncol(gap)
	names(gap)

	# Keep only the three columns of interest and show the result.
	subset_gap <- gap %>%
	  select(year, country, gdpPercap)
	subset_gap
	```

	filter:
	```{r}
	# Pipeline: start from gap, keep the European rows, then keep three columns.
	# The result is stored in yr_country_gdp_euro.
	yr_country_gdp_euro <- gap %>%
	  filter(continent == "Europe") %>%
	  select(year, country, gdpPercap)
	yr_country_gdp_euro

	# African rows for 2007 only, reduced to year, country and life expectancy.
	lifeExp_africa_07 <- gap %>%
	  filter(continent == "Africa" & year == 2007) %>%
	  select(year, country, lifeExp)
	dim(lifeExp_africa_07)
	```

	group_by() - subset data frame
	Turns a single data frame into a set of data frames based on some variable
	```{r}
	# Look at the ungrouped table first, for comparison.
	head(gap)
	str(gap)

	# Same rows, now grouped by continent.
	gap_by_continent <- group_by(gap, continent)
	head(gap_by_continent)
	str(gap_by_continent)

	# One row per continent holding its mean GDP per capita.
	gdp_by_continent <- gap_by_continent %>%
	  summarize(
	    mean_gdpPercap = mean(gdpPercap)
	  )
	```

	Exercise
	```{r}
	# Average life expectancy across all African countries by year.
	# In how many years did the average life expectancy decrease?

	african_le_yrs <- gap %>%
	  filter(continent == "Africa") %>%
	  group_by(year) %>%
	  summarize(avg_life = mean(lifeExp))

	# View() opens an interactive data viewer, so only call it when a user is
	# present; this keeps non-interactive rendering from failing.
	if (interactive()) {
	  View(african_le_yrs)
	}

	# diff() gives the change between consecutive rows of the yearly averages;
	# a negative change is a decrease. This also handles a one-row table
	# (diff() is empty, so the count is 0), which a `1:(n - 1)` loop does not.
	life_exp_dec <- sum(diff(african_le_yrs$avg_life) < 0)

	print(life_exp_dec)
	```

	Calculate average life expectancy per country. Which has the longest and shortest average life expectancy?

	```{r}
	# Mean life expectancy of every country over all of its rows.
	le_by_country <- summarize(
	  group_by(gap, country),
	  avg_le = mean(lifeExp)
	)

	# Country (or countries, on ties) with the shortest average, then the longest.
	filter(le_by_country, avg_le == min(avg_le))
	filter(le_by_country, avg_le == max(avg_le))
	```

	mutate() is a function in dplyr that lets you add a column to a dataframe based on the data.
	```{r}
	# Total GDP in billions for 2007: per-capita GDP times population, divided
	# by 1e9, keeping only the identifying columns and the new value.
	bill_gdp <- gap %>%
	  filter(year == 2007) %>%
	  mutate(billion_gdp = (gdpPercap * pop) / 1e9) %>%
	  select(country, continent, billion_gdp)

	names(bill_gdp)
	dim(bill_gdp)
	```

	Many of these functions can handle several arguments at once

	```{r}
	# Several summaries per continent/year group in a single summarize() call.
	# n() is the number of rows in the current group; se_gdp reuses sd_gdp and
	# sample_size, which are defined earlier in the same call.
	gdp_by_continent <- gap %>%
	  group_by(continent, year) %>%
	  summarize(
	    mean_gdp = mean(gdpPercap),
	    sd_gdp = sd(gdpPercap),
	    mean_pop = mean(pop),
	    sample_size = n(),
	    se_gdp = sd_gdp / sqrt(sample_size)
	  )

	#View(gdp_by_continent)

	str(gdp_by_continent)
	```

	use function data.frame() to recombine data frames created by group_by back into a single data frame. Function ungroup() works the same way
	```{r}
	# Convert the grouped summary to a plain data frame, then pass it through
	# ungroup() as well, and inspect the resulting structure.
	gdp_by_continent <- gdp_by_continent %>%
	  data.frame() %>%
	  ungroup()
	str(gdp_by_continent)
	```

	### Combining dplyr with ggplot2
	```{r}
	library(ggplot2)

	# Scatter plot of life expectancy against GDP per capita, Asian rows only.
	ggplot(
	  data = filter(gap, continent == "Asia"),
	  mapping = aes(x = gdpPercap, y = lifeExp)
	) +
	  geom_point()
	```

	Wide and Long
	```{r}
	# Read the wide-format table, keeping text columns as character vectors.
	gap_wide <- read.csv(
	  file = "data/gapminder_wide.csv",
	  stringsAsFactors = FALSE
	)
	dim(gap_wide)
	names(gap_wide)
	```

	This data is in a wide format. We want it to be in a long format.

	```{r}
	head(gap_wide)

	# gather() stacks every column except continent and country into a
	# key column (obstype_year) and a value column (obs_values).
	gap_long <- gap_wide %>%
	  gather(key = obstype_year, value = obs_values, -continent, -country)

	dim(gap_long)

	# The combined key makes filtering by year awkward, so split it at the
	# underscore into the observation type and the year.
	gap_long <- gap_long %>%
	  separate(col = obstype_year, into = c("obs_type", "year"), sep = "_")
	head(gap_long)

	# separate() leaves year as text; store it as an integer instead.
	gap_long <- mutate(gap_long, year = as.integer(year))
	head(gap_long)
	```

	```{r}
	# Mean of the lifeExp observations per continent, from the long table.
	gap_long %>%
	  filter(obs_type == "lifeExp") %>%
	  group_by(continent) %>%
	  summarize(avg_life_exp = mean(obs_values))
	```

	Opposite of gather is spread

	```{r}
	str(gap_long)

	# spread() turns each distinct obs_type into its own column filled from
	# obs_values.
	gap_normal <- spread(gap_long, key = obs_type, value = obs_values)
	str(gap_normal)
	names(gap_normal)
	names(gap)

	# Put the columns in the same order as in gap.
	gap_normal <- gap_normal[names(gap)]
	names(gap_normal)

	# Compare the two tables. The original author notes that they do not match
	# because the column data types differ.
	all.equal(gap, gap_normal)
	```
	```{r}
	# NOTE(review): the original pipeline ended in a dangling `s`, which fails at
	# run time. It is reconstructed here as a type-alignment step so gap_normal
	# can be compared with gap — confirm this was the intent.
	# Assumes gap$country and gap$continent are factors whose levels cover the
	# values read from the CSV — TODO confirm.
	gap_normal <- gap_normal %>%
	  mutate(
	    country = factor(country, levels = levels(gap$country)),
	    continent = factor(continent, levels = levels(gap$continent))
	  ) %>%
	  arrange(country, year)
	str(gap_normal)
	str(gap)
	```